; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Overview
; --------
; Every function in this file has the same three-part shape:
;   entry             - skips the loop when %N <= 0.
;   while.body.lr.ph  - splats the scalar %c0 into a vector (insertelement
;                       followed by shufflevector with an all-zero mask).
;   while.body        - builds a <4 x i1> predicate from the remaining count
;                       with @llvm.arm.mve.vctp32, does a masked load through
;                       %s1, applies one predicated MVE intrinsic that takes
;                       the splat as an operand, does a masked store back
;                       through %s1, advances the pointer by 4 elements and
;                       lowers the count by 4. The loop repeats while the
;                       count seen at the top of the iteration was > 4.
; vfma and vfmas additionally load a second vector through %s2.
;
; The autogenerated assertions expect each loop to be emitted as a
; dlstp.32 / letp loop whose arithmetic instruction takes the scalar straight
; from a core register (for example "vadd.i32 q0, q0, r1"); no separate splat
; instruction appears in the expected output.

; vadd: predicated i32 add of the loaded vector and the splat of %c0.
; Expected loop body instruction: "vadd.i32 q0, q0, r1".
define void @vadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vadd.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  ; Only enter the loop when there is at least one element to process.
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  ; Broadcast %c0 to all four lanes (lane 0 insert + all-zero shuffle mask).
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  ; Loop-carried state: current pointer and remaining element count.
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  ; Predicate derived from the remaining count; it gates the load, the
  ; arithmetic intrinsic and the store below.
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  ; The final operand is the loaded vector %2 itself (presumably the value
  ; used for inactive lanes - confirm against the intrinsic definition).
  %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  ; Step to the next group of four elements.
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  ; Compares the count from the top of this iteration (not %sub) against 4.
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vsub: same loop as vadd, using the predicated i32 sub intrinsic.
; Expected loop body instruction: "vsub.i32 q0, q0, r1".
define void @vsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vsub.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vmul: same loop as vadd, using the predicated i32 mul intrinsic.
; Expected loop body instruction: "vmul.i32 q0, q0, r1".
define void @vmul(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmul.i32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vqadd: same loop as vadd, using the predicated qadd intrinsic.
; Expected loop body instruction: "vqadd.s32 q0, q0, r1".
; Unlike add/sub/mul, this intrinsic takes an extra `i32 0` operand before
; the predicate. Its meaning is not visible in this file (presumably a
; signedness selector, given the .s32 mnemonic - confirm against the
; intrinsic definition).
define void @vqadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqadd.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vqsub: same loop as vqadd (including the extra `i32 0` operand), using the
; predicated qsub intrinsic.
; Expected loop body instruction: "vqsub.s32 q0, q0, r1".
define void @vqsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqsub.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vhadd: same loop as vqadd (including the extra `i32 0` operand), using the
; predicated hadd intrinsic.
; Expected loop body instruction: "vhadd.s32 q0, q0, r1".
define void @vhadd(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vhadd.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vhsub: same loop as vqadd (including the extra `i32 0` operand), using the
; predicated hsub intrinsic.
; Expected loop body instruction: "vhsub.s32 q0, q0, r1".
define void @vhsub(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vhsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB6_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB6_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vhsub.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vqdmull: differs from the other integer tests in its operand widths.
;   * %c0 is truncated to i16 and splatted into an <8 x i16> vector.
;   * Four i16 values are loaded (alignment 2), sign-extended to <4 x i32>
;     and then bitcast to <8 x i16> before being passed to the intrinsic.
;   * The <4 x i32> result is stored through a separate <4 x i32>* bitcast of
;     the same pointer.
; Expected loop body: "vldrh.s32 q0, [r0]" then "vqdmullb.s16 q0, q0, r1".
define void @vqdmull(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqdmull:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB7_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB7_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vqdmullb.s16 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  ; Narrow the scalar to 16 bits, then broadcast it to all eight i16 lanes.
  %conv = trunc i32 %c0 to i16
  %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  ; Load view of the pointer: four i16 elements.
  %1 = bitcast i32* %s1.addr.013 to <4 x i16>*
  %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
  ; Widen to i32 lanes, then reinterpret as eight i16 lanes for the call.
  %3 = sext <4 x i16> %2 to <4 x i32>
  %4 = bitcast <4 x i32> %3 to <8 x i16>
  %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3)
  ; Store view of the same pointer: four i32 elements.
  %6 = bitcast i32* %s1.addr.013 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vqdmulh: same loop as vadd, using the predicated qdmulh intrinsic.
; Expected loop body instruction: "vqdmulh.s32 q0, q0, r1".
define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqdmulh:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB8_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB8_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqdmulh.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vqrdmulh: same loop as vadd, using the predicated qrdmulh intrinsic.
; Expected loop body instruction: "vqrdmulh.s32 q0, q0, r1".
define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) {
; CHECK-LABEL: vqrdmulh:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB9_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB9_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
  %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vaddf: float counterpart of vadd. %s1 is a float* and %c0 a float; the
; masked load and the predicated add are called with the `fast` flag.
; Expected loop body instruction: "vadd.f32 q0, q0, r1".
define void @vaddf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vaddf:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB10_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB10_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vadd.f32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast float* %s1.addr.013 to <4 x float>*
  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vsubf: same loop as vaddf, using the predicated float sub intrinsic.
; Expected loop body instruction: "vsub.f32 q0, q0, r1".
define void @vsubf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vsubf:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB11_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB11_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vsub.f32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast float* %s1.addr.013 to <4 x float>*
  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vmulf: same loop as vaddf, using the predicated float mul intrinsic.
; Expected loop body instruction: "vmul.f32 q0, q0, r1".
define void @vmulf(float* %s1, float %c0, i32 %N) {
; CHECK-LABEL: vmulf:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB12_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB12_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmul.f32 q0, q0, r1
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB12_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp11 = icmp sgt i32 %N, 0
  br i1 %cmp11, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
  %1 = bitcast float* %s1.addr.013 to <4 x float>*
  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
  %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
  %sub = add nsw i32 %N.addr.012, -4
  %cmp = icmp sgt i32 %N.addr.012, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vfma: takes two pointers. Each iteration loads one vector through the
; advancing %s1 pointer and one through %s2; %s2 is bitcast once before the
; loop and is never advanced. The fma intrinsic receives
; (vector from %s2, splat of %c0, vector from %s1, predicate) and the result
; is stored back through %s1.
; Expected loop body instruction: "vfma.f32 q1, q0, r2".
define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) {
; CHECK-LABEL: vfma:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB13_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB13_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vfma.f32 q1, q0, r2
; CHECK-NEXT:    vstrw.32 q1, [r0], #16
; CHECK-NEXT:    letp lr, .LBB13_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp12 = icmp sgt i32 %N, 0
  br i1 %cmp12, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  ; Loop-invariant vector view of %s2.
  %0 = bitcast float* %s2 to <4 x float>*
  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  %2 = bitcast float* %s1.addr.014 to <4 x float>*
  ; %3 comes from %s1, %4 from %s2.
  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; The splat is the second operand here.
  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1)
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
  %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
  %sub = add nsw i32 %N.addr.013, -4
  %cmp = icmp sgt i32 %N.addr.013, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; vfmas: same loads and loop structure as vfma, but the fma intrinsic
; receives (vector from %s1, vector from %s2, splat of %c0, predicate), i.e.
; the splat is the third operand instead of the second.
; Expected loop body instruction: "vfmas.f32 q1, q0, r2".
define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) {
; CHECK-LABEL: vfmas:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB14_1: @ %while.body.lr.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB14_2: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vfmas.f32 q1, q0, r2
; CHECK-NEXT:    vstrw.32 q1, [r0], #16
; CHECK-NEXT:    letp lr, .LBB14_2
; CHECK-NEXT:  @ %bb.3: @ %while.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp12 = icmp sgt i32 %N, 0
  br i1 %cmp12, label %while.body.lr.ph, label %while.end

while.body.lr.ph:                                 ; preds = %entry
  ; Loop-invariant vector view of %s2.
  %0 = bitcast float* %s2 to <4 x float>*
  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
  %2 = bitcast float* %s1.addr.014 to <4 x float>*
  ; %3 comes from %s1, %4 from %s2.
  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
  ; The splat is the third operand here.
  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1)
  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
  %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
  %sub = add nsw i32 %N.addr.013, -4
  %cmp = icmp sgt i32 %N.addr.013, 4
  br i1 %cmp, label %while.body, label %while.end

while.end:                                        ; preds = %while.body, %entry
  ret void
}

; Intrinsic that turns the remaining element count into the <4 x i1>
; predicate used by every loop in this file.
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
; Declarations for the masked load/store intrinsics and the predicated MVE
; arithmetic intrinsics called by the test functions in this file. Each
; signature must match its call sites exactly.
; The qadd/qsub/hadd/hsub/vqdmull variants take an additional i32 operand
; ahead of the <4 x i1> predicate; the fma variant takes three vector
; operands followed by the predicate.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)