; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; Masked gather/scatter tests in which the vector of addresses is formed from
; a scalar pointer phi (advanced once per iteration) plus a constant vector of
; element offsets. The ptr_iv_* loops step %index by 4 up to 996, which matches
; the 249 loaded into lr in their expected output. The "_mult" variants add a
; further uniform offset to the address vector with a second getelementptr.
; The expected assembly is tied to the exact IR below; regenerate it with the
; script named above after any IR change rather than editing it by hand.

; <4 x i32>: gather from %A at element offsets {0,4,8,12}, add a splat of %y,
; scatter to %B at the same offsets. Both pointer phis advance by 16 elements
; (the #64 byte increments of r0/r1 in the expected output).
define void @ptr_iv_v4i32(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
; CHECK-LABEL: ptr_iv_v4i32:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI0_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.i32 q1, q1, r2
; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 12 @ 0xc
vector.ph:
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i32, i32* %pointer.phi, i32 16
  %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %2 = getelementptr i32, i32* %pointer.phi13, i32 16
  %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <4 x i32> with an extra uniform offset: +3 elements on the gather addresses
; and +5 on the scatter addresses. The expected constant pools hold the folded
; offsets {3,7,11,15} and {5,9,13,17}.
; NOTE(review): %scatter.address is built from %1 (the %A-based addresses),
; not from %3, so the store goes through %A -- which is declared readonly --
; and %B is never accessed (the expected output reuses r1 as a scratch
; register and only advances r0). The i16 and i8 "_mult" tests build their
; scatter address from %3. Confirm whether this is intentional.
define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) {
; CHECK-LABEL: ptr_iv_v4i32_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r1, .LCPI1_0
; CHECK-NEXT: adr r3, .LCPI1_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2]
; CHECK-NEXT: vadd.i32 q2, q2, r2
; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: le lr, .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 17 @ 0x11
; CHECK-NEXT: .LCPI1_1:
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 11 @ 0xb
; CHECK-NEXT: .long 15 @ 0xf
vector.ph:
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i32, i32* %pointer.phi, i32 16
  %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %gather.address = getelementptr i32, <4 x i32*> %1, i32 3
  %2 = getelementptr i32, i32* %pointer.phi13, i32 16
  %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %scatter.address = getelementptr i32, <4 x i32*> %1, i32 5
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gather.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %scatter.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <8 x i16>: gather from %A at element offsets {0,4,...,28}, add a splat of
; %y, scatter to %B at the same offsets. Both pointer phis advance by 32
; elements (#64 bytes in the expected output).
define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) {
; CHECK-LABEL: ptr_iv_v8i16:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI2_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.i16 q1, q1, r2
; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
; CHECK-NEXT: .short 0 @ 0x0
; CHECK-NEXT: .short 4 @ 0x4
; CHECK-NEXT: .short 8 @ 0x8
; CHECK-NEXT: .short 12 @ 0xc
; CHECK-NEXT: .short 16 @ 0x10
; CHECK-NEXT: .short 20 @ 0x14
; CHECK-NEXT: .short 24 @ 0x18
; CHECK-NEXT: .short 28 @ 0x1c
vector.ph:
  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0
  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i16, i16* %pointer.phi, i32 32
  %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %2 = getelementptr i16, i16* %pointer.phi13, i32 32
  %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}


; <8 x i16> with an extra uniform offset: gather addresses are %1 + 3
; elements (based on %A), scatter addresses are %3 + 5 elements (based on %B).
; The expected constant pools hold the folded offsets {3,...,31} and
; {5,...,33}.
define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) {
; CHECK-LABEL: ptr_iv_v8i16_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr.w r12, .LCPI3_0
; CHECK-NEXT: adr r3, .LCPI3_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.i16 q2, q2, r2
; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .short 5 @ 0x5
; CHECK-NEXT: .short 9 @ 0x9
; CHECK-NEXT: .short 13 @ 0xd
; CHECK-NEXT: .short 17 @ 0x11
; CHECK-NEXT: .short 21 @ 0x15
; CHECK-NEXT: .short 25 @ 0x19
; CHECK-NEXT: .short 29 @ 0x1d
; CHECK-NEXT: .short 33 @ 0x21
; CHECK-NEXT: .LCPI3_1:
; CHECK-NEXT: .short 3 @ 0x3
; CHECK-NEXT: .short 7 @ 0x7
; CHECK-NEXT: .short 11 @ 0xb
; CHECK-NEXT: .short 15 @ 0xf
; CHECK-NEXT: .short 19 @ 0x13
; CHECK-NEXT: .short 23 @ 0x17
; CHECK-NEXT: .short 27 @ 0x1b
; CHECK-NEXT: .short 31 @ 0x1f
vector.ph:
  %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0
  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i16, i16* %pointer.phi, i32 32
  %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %gather.address = getelementptr i16, <8 x i16*> %1, i16 3
  %2 = getelementptr i16, i16* %pointer.phi13, i32 32
  %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %scatter.address = getelementptr i16, <8 x i16*> %3, i16 5
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gather.address, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %scatter.address, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <16 x i8>: gather from %A at byte offsets {0,4,...,60}, add a splat of %y,
; scatter to %B at the same offsets. Both pointer phis advance by 64 bytes.
define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) {
; CHECK-LABEL: ptr_iv_v16i8:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI4_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q1, [r0, q0]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.i8 q1, q1, r2
; CHECK-NEXT: vstrb.8 q1, [r1, q0]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
; CHECK-NEXT: .byte 0 @ 0x0
; CHECK-NEXT: .byte 4 @ 0x4
; CHECK-NEXT: .byte 8 @ 0x8
; CHECK-NEXT: .byte 12 @ 0xc
; CHECK-NEXT: .byte 16 @ 0x10
; CHECK-NEXT: .byte 20 @ 0x14
; CHECK-NEXT: .byte 24 @ 0x18
; CHECK-NEXT: .byte 28 @ 0x1c
; CHECK-NEXT: .byte 32 @ 0x20
; CHECK-NEXT: .byte 36 @ 0x24
; CHECK-NEXT: .byte 40 @ 0x28
; CHECK-NEXT: .byte 44 @ 0x2c
; CHECK-NEXT: .byte 48 @ 0x30
; CHECK-NEXT: .byte 52 @ 0x34
; CHECK-NEXT: .byte 56 @ 0x38
; CHECK-NEXT: .byte 60 @ 0x3c
vector.ph: ; entry block, no predecessors
  %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0
  %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i8, i8* %pointer.phi, i32 64
  %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>
  %2 = getelementptr i8, i8* %pointer.phi13, i32 64
  %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %3, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}


; <16 x i8> with an extra uniform offset: gather addresses are %1 + 3 bytes
; (based on %A), scatter addresses are %3 + 5 bytes (based on %B). The
; expected constant pools hold the folded offsets {3,...,63} and {5,...,65}.
define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) {
; CHECK-LABEL: ptr_iv_v16i8_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr.w r12, .LCPI5_0
; CHECK-NEXT: adr r3, .LCPI5_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u8 q2, [r0, q0]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.i8 q2, q2, r2
; CHECK-NEXT: vstrb.8 q2, [r1, q1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .byte 5 @ 0x5
; CHECK-NEXT: .byte 9 @ 0x9
; CHECK-NEXT: .byte 13 @ 0xd
; CHECK-NEXT: .byte 17 @ 0x11
; CHECK-NEXT: .byte 21 @ 0x15
; CHECK-NEXT: .byte 25 @ 0x19
; CHECK-NEXT: .byte 29 @ 0x1d
; CHECK-NEXT: .byte 33 @ 0x21
; CHECK-NEXT: .byte 37 @ 0x25
; CHECK-NEXT: .byte 41 @ 0x29
; CHECK-NEXT: .byte 45 @ 0x2d
; CHECK-NEXT: .byte 49 @ 0x31
; CHECK-NEXT: .byte 53 @ 0x35
; CHECK-NEXT: .byte 57 @ 0x39
; CHECK-NEXT: .byte 61 @ 0x3d
; CHECK-NEXT: .byte 65 @ 0x41
; CHECK-NEXT: .LCPI5_1:
; CHECK-NEXT: .byte 3 @ 0x3
; CHECK-NEXT: .byte 7 @ 0x7
; CHECK-NEXT: .byte 11 @ 0xb
; CHECK-NEXT: .byte 15 @ 0xf
; CHECK-NEXT: .byte 19 @ 0x13
; CHECK-NEXT: .byte 23 @ 0x17
; CHECK-NEXT: .byte 27 @ 0x1b
; CHECK-NEXT: .byte 31 @ 0x1f
; CHECK-NEXT: .byte 35 @ 0x23
; CHECK-NEXT: .byte 39 @ 0x27
; CHECK-NEXT: .byte 43 @ 0x2b
; CHECK-NEXT: .byte 47 @ 0x2f
; CHECK-NEXT: .byte 51 @ 0x33
; CHECK-NEXT: .byte 55 @ 0x37
; CHECK-NEXT: .byte 59 @ 0x3b
; CHECK-NEXT: .byte 63 @ 0x3f
vector.ph: ; entry block, no predecessors
  %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0
  %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr i8, i8* %pointer.phi, i32 64
  %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>
  %gather.address = getelementptr i8, <16 x i8*> %1, i8 3
  %2 = getelementptr i8, i8* %pointer.phi13, i32 64
  %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> <i8 0, i8 4, i8 8, i8 12, i8 16, i8 20, i8 24, i8 28, i8 32, i8 36, i8 40, i8 44, i8 48, i8 52, i8 56, i8 60>
  %scatter.address = getelementptr i8, <16 x i8*> %3, i8 5
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gather.address, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %scatter.address, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <4 x float>: same shape as ptr_iv_v4i32, with an fadd of the splatted %y in
; place of the integer add.
define void @ptr_iv_v4f32(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
; CHECK-LABEL: ptr_iv_v4f32:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r3, .LCPI6_0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.f32 q1, q1, r2
; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 12 @ 0xc
vector.ph: ; entry block, no predecessors
  %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr float, float* %pointer.phi, i32 16
  %1 = getelementptr float, float* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %2 = getelementptr float, float* %pointer.phi13, i32 16
  %3 = getelementptr float, float* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <4 x float> with an extra uniform offset: +3 elements on the gather
; addresses and +5 on the scatter addresses.
; NOTE(review): as in ptr_iv_v4i32_mult, %scatter.address is built from %1
; (the %A-based addresses) rather than %3, so the store goes through the
; readonly %A and %B is never accessed. Confirm whether this is intentional.
define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) {
; CHECK-LABEL: ptr_iv_v4f32_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: adr r1, .LCPI7_0
; CHECK-NEXT: adr r3, .LCPI7_1
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2]
; CHECK-NEXT: vadd.f32 q2, q2, r2
; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: le lr, .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI7_0:
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .long 13 @ 0xd
; CHECK-NEXT: .long 17 @ 0x11
; CHECK-NEXT: .LCPI7_1:
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 11 @ 0xb
; CHECK-NEXT: .long 15 @ 0xf
vector.ph: ; entry block, no predecessors
  %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr float, float* %pointer.phi, i32 16
  %1 = getelementptr float, float* %pointer.phi, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %gather.address = getelementptr float, <4 x float*> %1, i32 3
  %2 = getelementptr float, float* %pointer.phi13, i32 16
  %3 = getelementptr float, float* %pointer.phi13, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %scatter.address = getelementptr float, <4 x float*> %1, i32 5
  %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gather.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %scatter.address, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <8 x half>: %y arrives as a float and is truncated to half in vector.ph
; before being splatted; otherwise the same shape as ptr_iv_v8i16 with an
; fadd in place of the integer add.
define void @ptr_iv_v8f16(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) {
; CHECK-LABEL: ptr_iv_v8f16:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov s0, r2
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: adr r3, .LCPI8_0
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: vadd.f16 q1, q1, r2
; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1]
; CHECK-NEXT: adds r1, #64
; CHECK-NEXT: le lr, .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI8_0:
; CHECK-NEXT: .short 0 @ 0x0
; CHECK-NEXT: .short 4 @ 0x4
; CHECK-NEXT: .short 8 @ 0x8
; CHECK-NEXT: .short 12 @ 0xc
; CHECK-NEXT: .short 16 @ 0x10
; CHECK-NEXT: .short 20 @ 0x14
; CHECK-NEXT: .short 24 @ 0x18
; CHECK-NEXT: .short 28 @ 0x1c
vector.ph:
  %y.trunc = fptrunc float %y to half
  %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
  %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr half, half* %pointer.phi, i32 32
  %1 = getelementptr half, half* %pointer.phi, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %2 = getelementptr half, half* %pointer.phi13, i32 32
  %3 = getelementptr half, half* %pointer.phi13, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %1, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %3, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}

; <8 x half> with an extra uniform offset: +3 elements on the gather
; addresses and +5 on the scatter addresses.
; NOTE(review): as in ptr_iv_v4i32_mult, %scatter.address is built from %1
; (the %A-based addresses) rather than %3, so the store goes through the
; readonly %A and %B is never accessed (the expected output reuses r1 for
; the converted %y). Confirm whether this is intentional.
define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) {
; CHECK-LABEL: ptr_iv_v8f16_mult:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov s0, r2
; CHECK-NEXT: adr r2, .LCPI9_0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: mov.w lr, #249
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: vldrw.u32 q0, [r2]
; CHECK-NEXT: adr r2, .LCPI9_1
; CHECK-NEXT: vldrw.u32 q1, [r2]
; CHECK-NEXT: .LBB9_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1]
; CHECK-NEXT: vadd.f16 q2, q2, r1
; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1]
; CHECK-NEXT: adds r0, #64
; CHECK-NEXT: le lr, .LBB9_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI9_0:
; CHECK-NEXT: .short 3 @ 0x3
; CHECK-NEXT: .short 7 @ 0x7
; CHECK-NEXT: .short 11 @ 0xb
; CHECK-NEXT: .short 15 @ 0xf
; CHECK-NEXT: .short 19 @ 0x13
; CHECK-NEXT: .short 23 @ 0x17
; CHECK-NEXT: .short 27 @ 0x1b
; CHECK-NEXT: .short 31 @ 0x1f
; CHECK-NEXT: .LCPI9_1:
; CHECK-NEXT: .short 5 @ 0x5
; CHECK-NEXT: .short 9 @ 0x9
; CHECK-NEXT: .short 13 @ 0xd
; CHECK-NEXT: .short 17 @ 0x11
; CHECK-NEXT: .short 21 @ 0x15
; CHECK-NEXT: .short 25 @ 0x19
; CHECK-NEXT: .short 29 @ 0x1d
; CHECK-NEXT: .short 33 @ 0x21
vector.ph:
  %y.trunc = fptrunc float %y to half
  %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0
  %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:
  %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ]
  %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr half, half* %pointer.phi, i32 32
  %1 = getelementptr half, half* %pointer.phi, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %gather.address = getelementptr half, <8 x half*> %1, i32 3
  %2 = getelementptr half, half* %pointer.phi13, i32 32
  %3 = getelementptr half, half* %pointer.phi13, <8 x i16> <i16 0, i16 4, i16 8, i16 12, i16 16, i16 20, i16 24, i16 28>
  %scatter.address = getelementptr half, <8 x half*> %1, i32 5
  %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gather.address, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat
  call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %scatter.address, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, 996
  br i1 %5, label %end, label %vector.body

end:
  ret void
}


; Three gathers and three scatters share one address vector per pointer phi:
; base element offsets {0,3,6,9}, accessed at +0, +1 and +2 elements. The
; first gathered vector is multiplied by a splat of 10, by the second gather
; and by the third gather, and the three products are scattered to %z at +0,
; +1 and +2. Both pointer phis advance by 12 elements (#48 bytes in the
; expected output). The loop exits when %index.next equals %n, so the
; expected output counts r2 down by 4 with subs/bne.
; %v5 has no uses.
define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, i32* nocapture %z, i32 %n) {
; CHECK-LABEL: three_pointer_iv_v4i32:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: adr.w r12, .LCPI10_0
; CHECK-NEXT: adr.w lr, .LCPI10_1
; CHECK-NEXT: adr r3, .LCPI10_2
; CHECK-NEXT: vldrw.u32 q2, [lr]
; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vmov.i32 q0, #0xa
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2]
; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2]
; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.i32 q4, q5, q4
; CHECK-NEXT: add.w r0, r0, #48
; CHECK-NEXT: vmul.i32 q6, q5, q6
; CHECK-NEXT: vmul.i32 q5, q5, q0
; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2]
; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2]
; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2]
; CHECK-NEXT: add.w r1, r1, #48
; CHECK-NEXT: bne .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .LCPI10_1:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .LCPI10_2:
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 11 @ 0xb
vector.ph:
  br label %vector.body

vector.body:
  %pointer.phi = phi i32* [ %x, %vector.ph ], [ %v3, %vector.body ]
  %pointer.phi55 = phi i32* [ %z, %vector.ph ], [ %v4, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v3 = getelementptr i32, i32* %pointer.phi, i32 12
  %vector.gep56 = getelementptr i32, i32* %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v4 = getelementptr i32, i32* %pointer.phi55, i32 12
  %v5 = add i32 %index, 0
  %v6 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %v7 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 2
  %wide.masked.gather57 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v6, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %wide.masked.gather58 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v7, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %v11 = mul nuw nsw <4 x i32> %wide.masked.gather, <i32 10, i32 10, i32 10, i32 10>
  %v13 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather57
  %v15 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather58
  %v17 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 1
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v11, <4 x i32*> %vector.gep56, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %v18 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 2
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v13, <4 x i32*> %v17, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v15, <4 x i32*> %v18, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %v37 = icmp eq i32 %index.next, %n
  br i1 %v37, label %end, label %vector.body

end:
  ret void;
}

; Same three-gather/three-scatter shape as three_pointer_iv_v4i32, on i8
; elements: each gathered <4 x i8> is zero-extended to <4 x i32>, multiplied,
; and truncated back to <4 x i8> before the scatter. The expected output uses
; vldrb.u32 loads and vstrb.32 stores, and the pointers advance by 12 bytes.
; %v5 has no uses.
define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) {
; CHECK-LABEL: three_pointer_iv_v4i8:
; CHECK: @ %bb.0: @ %vector.ph
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: adr.w r12, .LCPI11_0
; CHECK-NEXT: adr.w lr, .LCPI11_1
; CHECK-NEXT: adr r3, .LCPI11_2
; CHECK-NEXT: vldrw.u32 q2, [lr]
; CHECK-NEXT: vldrw.u32 q1, [r3]
; CHECK-NEXT: vldrw.u32 q3, [r12]
; CHECK-NEXT: vmov.i32 q0, #0xa
; CHECK-NEXT: .LBB11_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrb.u32 q4, [r0, q1]
; CHECK-NEXT: vldrb.u32 q5, [r0, q2]
; CHECK-NEXT: vldrb.u32 q6, [r0, q3]
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.i32 q4, q5, q4
; CHECK-NEXT: add.w r0, r0, #12
; CHECK-NEXT: vmul.i32 q6, q5, q6
; CHECK-NEXT: vmul.i32 q5, q5, q0
; CHECK-NEXT: vstrb.32 q5, [r1, q2]
; CHECK-NEXT: vstrb.32 q6, [r1, q3]
; CHECK-NEXT: vstrb.32 q4, [r1, q1]
; CHECK-NEXT: add.w r1, r1, #12
; CHECK-NEXT: bne .LBB11_1
; CHECK-NEXT: @ %bb.2: @ %end
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .long 10 @ 0xa
; CHECK-NEXT: .LCPI11_1:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 3 @ 0x3
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 9 @ 0x9
; CHECK-NEXT: .LCPI11_2:
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 8 @ 0x8
; CHECK-NEXT: .long 11 @ 0xb
vector.ph:
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ]
  %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vector.gep = getelementptr i8, i8* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v3 = getelementptr i8, i8* %pointer.phi, i32 12
  %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
  %v4 = getelementptr i8, i8* %pointer.phi55, i32 12
  %v5 = add i32 %index, 0
  %v6 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 1
  %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %vector.gep, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %v7 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 2
  %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v6, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v7, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %v8 = zext <4 x i8> %wide.masked.gather to <4 x i32>
  %v9 = zext <4 x i8> %wide.masked.gather57 to <4 x i32>
  %v10 = zext <4 x i8> %wide.masked.gather58 to <4 x i32>
  %v11 = mul nuw nsw <4 x i32> %v8, <i32 10, i32 10, i32 10, i32 10>
  %v12 = trunc <4 x i32> %v11 to <4 x i8>
  %v13 = mul nuw nsw <4 x i32> %v8, %v9
  %v14 = trunc <4 x i32> %v13 to <4 x i8>
  %v15 = mul nuw nsw <4 x i32> %v8, %v10
  %v16 = trunc <4 x i32> %v15 to <4 x i8>
  %v17 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 1
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v12, <4 x i8*> %vector.gep56, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %v18 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 2
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v14, <4 x i8*> %v17, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v16, <4 x i8*> %v18, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  %index.next = add i32 %index, 4
  %v37 = icmp eq i32 %index.next, %n
  br i1 %v37, label %end, label %vector.body

end:
  ret void;
}

define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) {
; CHECK-LABEL:
three_pointer_iv_v8i16: 779; CHECK: @ %bb.0: @ %vector.ph 780; CHECK-NEXT: .save {r7, lr} 781; CHECK-NEXT: push {r7, lr} 782; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 783; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 784; CHECK-NEXT: adr.w r12, .LCPI12_0 785; CHECK-NEXT: adr.w lr, .LCPI12_1 786; CHECK-NEXT: adr r3, .LCPI12_2 787; CHECK-NEXT: vldrw.u32 q2, [lr] 788; CHECK-NEXT: vldrw.u32 q1, [r3] 789; CHECK-NEXT: vldrw.u32 q3, [r12] 790; CHECK-NEXT: vmov.i16 q0, #0xa 791; CHECK-NEXT: .LBB12_1: @ %vector.body 792; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 793; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] 794; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] 795; CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1] 796; CHECK-NEXT: subs r2, #4 797; CHECK-NEXT: vmul.i16 q4, q5, q4 798; CHECK-NEXT: add.w r0, r0, #48 799; CHECK-NEXT: vmul.i16 q6, q5, q6 800; CHECK-NEXT: vmul.i16 q5, q5, q0 801; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] 802; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1] 803; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] 804; CHECK-NEXT: add.w r1, r1, #48 805; CHECK-NEXT: bne .LBB12_1 806; CHECK-NEXT: @ %bb.2: @ %end 807; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 808; CHECK-NEXT: pop {r7, pc} 809; CHECK-NEXT: .p2align 4 810; CHECK-NEXT: @ %bb.3: 811; CHECK-NEXT: .LCPI12_0: 812; CHECK-NEXT: .short 1 @ 0x1 813; CHECK-NEXT: .short 4 @ 0x4 814; CHECK-NEXT: .short 7 @ 0x7 815; CHECK-NEXT: .short 10 @ 0xa 816; CHECK-NEXT: .short 13 @ 0xd 817; CHECK-NEXT: .short 16 @ 0x10 818; CHECK-NEXT: .short 19 @ 0x13 819; CHECK-NEXT: .short 22 @ 0x16 820; CHECK-NEXT: .LCPI12_1: 821; CHECK-NEXT: .short 0 @ 0x0 822; CHECK-NEXT: .short 3 @ 0x3 823; CHECK-NEXT: .short 6 @ 0x6 824; CHECK-NEXT: .short 9 @ 0x9 825; CHECK-NEXT: .short 12 @ 0xc 826; CHECK-NEXT: .short 15 @ 0xf 827; CHECK-NEXT: .short 18 @ 0x12 828; CHECK-NEXT: .short 21 @ 0x15 829; CHECK-NEXT: .LCPI12_2: 830; CHECK-NEXT: .short 2 @ 0x2 831; CHECK-NEXT: .short 5 @ 0x5 832; CHECK-NEXT: .short 8 @ 0x8 833; CHECK-NEXT: 
.short 11 @ 0xb 834; CHECK-NEXT: .short 14 @ 0xe 835; CHECK-NEXT: .short 17 @ 0x11 836; CHECK-NEXT: .short 20 @ 0x14 837; CHECK-NEXT: .short 23 @ 0x17 838vector.ph: 839 br label %vector.body 840 841vector.body: 842 %pointer.phi = phi i16* [ %x, %vector.ph ], [ %v3, %vector.body ] 843 %pointer.phi55 = phi i16* [ %z, %vector.ph ], [ %v4, %vector.body ] 844 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 845 %vector.gep = getelementptr i16, i16* %pointer.phi, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21> 846 %v3 = getelementptr i16, i16* %pointer.phi, i32 24 847 %vector.gep56 = getelementptr i16, i16* %pointer.phi55, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21> 848 %v4 = getelementptr i16, i16* %pointer.phi55, i32 24 849 %v5 = add i32 %index, 0 850 %v6 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 1 851 %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %vector.gep, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 852 %v7 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 2 853 %wide.masked.gather57 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v6, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 854 %wide.masked.gather58 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v7, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 855 %v11 = mul nuw nsw <8 x i16> %wide.masked.gather, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> 856 %v13 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather57 857 %v15 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather58 858 %v17 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 1 859 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v11, <8 x i16*> %vector.gep56, i32 
2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 860 %v18 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 2 861 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v13, <8 x i16*> %v17, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 862 call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v15, <8 x i16*> %v18, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 863 %index.next = add i32 %index, 4 864 %v37 = icmp eq i32 %index.next, %n 865 br i1 %v37, label %end, label %vector.body 866 867end: 868 ret void; 869} 870 871define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { 872; CHECK-LABEL: three_pointer_iv_v16i8: 873; CHECK: @ %bb.0: @ %vector.ph 874; CHECK-NEXT: .save {r7, lr} 875; CHECK-NEXT: push {r7, lr} 876; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 877; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 878; CHECK-NEXT: adr.w r12, .LCPI13_0 879; CHECK-NEXT: adr.w lr, .LCPI13_1 880; CHECK-NEXT: adr r3, .LCPI13_2 881; CHECK-NEXT: vldrw.u32 q2, [lr] 882; CHECK-NEXT: vldrw.u32 q1, [r3] 883; CHECK-NEXT: vldrw.u32 q3, [r12] 884; CHECK-NEXT: vmov.i8 q0, #0xa 885; CHECK-NEXT: .LBB13_1: @ %vector.body 886; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 887; CHECK-NEXT: vldrb.u8 q4, [r0, q1] 888; CHECK-NEXT: vldrb.u8 q5, [r0, q2] 889; CHECK-NEXT: vldrb.u8 q6, [r0, q3] 890; CHECK-NEXT: subs r2, #4 891; CHECK-NEXT: vmul.i8 q4, q5, q4 892; CHECK-NEXT: add.w r0, r0, #48 893; CHECK-NEXT: vmul.i8 q6, q5, q6 894; CHECK-NEXT: vmul.i8 q5, q5, q0 895; CHECK-NEXT: vstrb.8 q5, [r1, q2] 896; CHECK-NEXT: vstrb.8 q6, [r1, q3] 897; CHECK-NEXT: vstrb.8 q4, [r1, q1] 898; CHECK-NEXT: add.w r1, r1, #48 899; CHECK-NEXT: bne .LBB13_1 900; CHECK-NEXT: @ %bb.2: @ %end 901; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 902; CHECK-NEXT: pop {r7, pc} 903; CHECK-NEXT: .p2align 4 904; CHECK-NEXT: @ %bb.3: 
905; CHECK-NEXT: .LCPI13_0: 906; CHECK-NEXT: .byte 1 @ 0x1 907; CHECK-NEXT: .byte 4 @ 0x4 908; CHECK-NEXT: .byte 7 @ 0x7 909; CHECK-NEXT: .byte 10 @ 0xa 910; CHECK-NEXT: .byte 13 @ 0xd 911; CHECK-NEXT: .byte 16 @ 0x10 912; CHECK-NEXT: .byte 19 @ 0x13 913; CHECK-NEXT: .byte 22 @ 0x16 914; CHECK-NEXT: .byte 25 @ 0x19 915; CHECK-NEXT: .byte 28 @ 0x1c 916; CHECK-NEXT: .byte 31 @ 0x1f 917; CHECK-NEXT: .byte 34 @ 0x22 918; CHECK-NEXT: .byte 37 @ 0x25 919; CHECK-NEXT: .byte 40 @ 0x28 920; CHECK-NEXT: .byte 43 @ 0x2b 921; CHECK-NEXT: .byte 46 @ 0x2e 922; CHECK-NEXT: .LCPI13_1: 923; CHECK-NEXT: .byte 0 @ 0x0 924; CHECK-NEXT: .byte 3 @ 0x3 925; CHECK-NEXT: .byte 6 @ 0x6 926; CHECK-NEXT: .byte 9 @ 0x9 927; CHECK-NEXT: .byte 12 @ 0xc 928; CHECK-NEXT: .byte 15 @ 0xf 929; CHECK-NEXT: .byte 18 @ 0x12 930; CHECK-NEXT: .byte 21 @ 0x15 931; CHECK-NEXT: .byte 24 @ 0x18 932; CHECK-NEXT: .byte 27 @ 0x1b 933; CHECK-NEXT: .byte 30 @ 0x1e 934; CHECK-NEXT: .byte 33 @ 0x21 935; CHECK-NEXT: .byte 36 @ 0x24 936; CHECK-NEXT: .byte 39 @ 0x27 937; CHECK-NEXT: .byte 42 @ 0x2a 938; CHECK-NEXT: .byte 45 @ 0x2d 939; CHECK-NEXT: .LCPI13_2: 940; CHECK-NEXT: .byte 2 @ 0x2 941; CHECK-NEXT: .byte 5 @ 0x5 942; CHECK-NEXT: .byte 8 @ 0x8 943; CHECK-NEXT: .byte 11 @ 0xb 944; CHECK-NEXT: .byte 14 @ 0xe 945; CHECK-NEXT: .byte 17 @ 0x11 946; CHECK-NEXT: .byte 20 @ 0x14 947; CHECK-NEXT: .byte 23 @ 0x17 948; CHECK-NEXT: .byte 26 @ 0x1a 949; CHECK-NEXT: .byte 29 @ 0x1d 950; CHECK-NEXT: .byte 32 @ 0x20 951; CHECK-NEXT: .byte 35 @ 0x23 952; CHECK-NEXT: .byte 38 @ 0x26 953; CHECK-NEXT: .byte 41 @ 0x29 954; CHECK-NEXT: .byte 44 @ 0x2c 955; CHECK-NEXT: .byte 47 @ 0x2f 956vector.ph: 957 br label %vector.body 958 959vector.body: 960 %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ] 961 %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ] 962 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 963 %vector.gep = getelementptr i8, i8* %pointer.phi, <16 x i8> <i8 0, i8 3, i8 6, i8 
9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45> 964 %v3 = getelementptr i8, i8* %pointer.phi, i32 48 965 %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45> 966 %v4 = getelementptr i8, i8* %pointer.phi55, i32 48 967 %v5 = add i32 %index, 0 968 %v6 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 1 969 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %vector.gep, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 970 %v7 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 2 971 %wide.masked.gather57 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v6, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 972 %wide.masked.gather58 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v7, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 973 %v11 = mul nuw nsw <16 x i8> %wide.masked.gather, <i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10, i8 10> 974 %v13 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather57 975 %v15 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather58 976 %v17 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 1 977 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v11, <16 x i8*> %vector.gep56, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 
978 %v18 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 2 979 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v13, <16 x i8*> %v17, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 980 call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v15, <16 x i8*> %v18, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 981 %index.next = add i32 %index, 4 982 %v37 = icmp eq i32 %index.next, %n 983 br i1 %v37, label %end, label %vector.body 984 985end: 986 ret void; 987} 988 989declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) 990declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) 991declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) 992declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) 993declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) 994declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) 995 996declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) 997declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) 998declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) 999declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) 1000declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) 1001declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) 1002