; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

; VLDRB.8
define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u8 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
; CHECK-LABEL: unscaled_v8i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q1, [r1]
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vstrb.16 q0, [r0, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand
define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) {
; CHECK-LABEL: unscaled_v2i8_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrb r2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    ldrb r1, [r1, #1]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    vmov r1, s4
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    strb r2, [r0, r1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x i8>, <2 x i8>* %offptr, align 1
  %offs.zext = zext <2 x i8> %offs to <2 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext
  call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> <i1 true, i1 true>)
  ret void
}

; Expand - sext offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrb.s32 q1, [r1]
; CHECK-NEXT:    vldrb.s32 q3, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrb.s32 q1, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.sext = sext <16 x i8> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - sext i16 offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrh.s32 q1, [r1]
; CHECK-NEXT:    vldrh.s32 q3, [r1, #16]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrh.s32 q1, [r1, #8]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrh.s32 q1, [r1, #24]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i16>, <16 x i16>* %offptr, align 2
  %offs.sext = sext <16 x i16> %offs to <16 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Could be manually scaled offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_scaled:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrb.u32 q1, [r1]
; CHECK-NEXT:    vldrb.u32 q3, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r7, q0[4]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vshl.i32 q3, q3, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrb.u32 q1, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vmov.u8 r1, q0[6]
; CHECK-NEXT:    vshl.i32 q1, q1, #2
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r5, d7
; CHECK-NEXT:    strb r7, [r4]
; CHECK-NEXT:    vmov.u8 r7, q0[5]
; CHECK-NEXT:    strb.w r7, [r8]
; CHECK-NEXT:    vmov r7, r4, d2
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r5]
; CHECK-NEXT:    vmov.u8 r2, q0[12]
; CHECK-NEXT:    strb r2, [r7]
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    strb r2, [r4]
; CHECK-NEXT:    vmov.u8 r2, q0[14]
; CHECK-NEXT:    strb r2, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r1]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 4
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext
  %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Expand - large offsets
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_next:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q3, [r1, #32]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vmov r2, r3, d2
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d3
; CHECK-NEXT:    vldrw.u32 q1, [r1, #16]
; CHECK-NEXT:    vadd.i32 q2, q1, r0
; CHECK-NEXT:    vldrw.u32 q1, [r1, #48]
; CHECK-NEXT:    vmov r4, r8, d4
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r0, r9, d5
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i32>, <16 x i32>* %offptr, align 4
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Values are truncated (i64 -> i8) before being scattered.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    add r3, sp, #40
; CHECK-NEXT:    vmov.8 q5[0], r4
; CHECK-NEXT:    vmov r4, s2
; CHECK-NEXT:    vmov.8 q5[1], r4
; CHECK-NEXT:    vmov r4, s4
; CHECK-NEXT:    vmov.8 q5[2], r4
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov.8 q5[3], r4
; CHECK-NEXT:    vmov r4, s8
; CHECK-NEXT:    vmov.8 q5[4], r4
; CHECK-NEXT:    vmov r4, s10
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vmov.8 q5[5], r4
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    add.w lr, sp, #56
; CHECK-NEXT:    vmov.8 q5[6], r4
; CHECK-NEXT:    vmov r4, s14
; CHECK-NEXT:    vmov.8 q5[7], r4
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[8], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [lr]
; CHECK-NEXT:    vmov.8 q5[9], r3
; CHECK-NEXT:    add.w r12, sp, #72
; CHECK-NEXT:    add r2, sp, #88
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vldrw.u32 q4, [r2]
; CHECK-NEXT:    vmov.8 q5[10], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vldrw.u32 q0, [r12]
; CHECK-NEXT:    vmov.8 q5[11], r3
; CHECK-NEXT:    vmov r2, s18
; CHECK-NEXT:    vmov r3, s0
; CHECK-NEXT:    vmov.8 q5[12], r3
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov.8 q5[13], r3
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmov.8 q5[14], r3
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vmov.8 q5[15], r2
; CHECK-NEXT:    vstrb.8 q5, [r0, q0]
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    pop {r4, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i64> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Values are truncated (i32 -> i8) before being scattered.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrb.32 q3, [r2, #12]
; CHECK-NEXT:    vstrb.32 q2, [r2, #8]
; CHECK-NEXT:    vstrb.32 q1, [r2, #4]
; CHECK-NEXT:    vstrb.32 q0, [r2]
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i32> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Values are truncated (i16 -> i8) before being scattered.
define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    mov r2, sp
; CHECK-NEXT:    vstrb.16 q1, [r2, #8]
; CHECK-NEXT:    vstrb.16 q0, [r2]
; CHECK-NEXT:    vldrb.u8 q0, [r1]
; CHECK-NEXT:    vldrw.u32 q1, [r2]
; CHECK-NEXT:    vstrb.8 q1, [r0, q0]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    bx lr
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %offs.zext = zext <16 x i8> %offs to <16 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext
  %input.trunc = trunc <16 x i16> %input to <16 x i8>
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Two geps: a vector of loaded offsets plus a constant (splat) offset of 5.
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    vldrb.s32 q2, [r1]
; CHECK-NEXT:    vmov.i32 q1, #0x5
; CHECK-NEXT:    vldrb.s32 q4, [r1, #8]
; CHECK-NEXT:    vmov.u8 r6, q0[0]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov.u8 r5, q0[4]
; CHECK-NEXT:    vadd.i32 q2, q2, q1
; CHECK-NEXT:    vadd.i32 q4, q4, r0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    vmov.u8 r7, q0[6]
; CHECK-NEXT:    vmov r12, lr, d5
; CHECK-NEXT:    vldrb.s32 q2, [r1, #4]
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vadd.i32 q3, q2, q1
; CHECK-NEXT:    vldrb.s32 q2, [r1, #12]
; CHECK-NEXT:    vmov r4, r8, d6
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r0, r9, d7
; CHECK-NEXT:    vadd.i32 q3, q4, q1
; CHECK-NEXT:    vadd.i32 q1, q2, q1
; CHECK-NEXT:    strb r6, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    strb r2, [r3]
; CHECK-NEXT:    vmov.u8 r6, q0[2]
; CHECK-NEXT:    vmov r2, r10, d6
; CHECK-NEXT:    strb.w r6, [r12]
; CHECK-NEXT:    vmov.u8 r6, q0[3]
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    strb.w r6, [lr]
; CHECK-NEXT:    vmov r6, r1, d7
; CHECK-NEXT:    strb r5, [r4]
; CHECK-NEXT:    vmov.u8 r5, q0[5]
; CHECK-NEXT:    strb.w r5, [r8]
; CHECK-NEXT:    vmov r5, r4, d2
; CHECK-NEXT:    strb r7, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[7]
; CHECK-NEXT:    strb.w r0, [r9]
; CHECK-NEXT:    vmov r0, r7, d3
; CHECK-NEXT:    strb r3, [r2]
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    strb.w r2, [r10]
; CHECK-NEXT:    vmov.u8 r2, q0[10]
; CHECK-NEXT:    strb r2, [r6]
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    strb r2, [r1]
; CHECK-NEXT:    vmov.u8 r1, q0[12]
; CHECK-NEXT:    strb r1, [r5]
; CHECK-NEXT:    vmov.u8 r1, q0[13]
; CHECK-NEXT:    strb r1, [r4]
; CHECK-NEXT:    vmov.u8 r1, q0[14]
; CHECK-NEXT:    strb r1, [r0]
; CHECK-NEXT:    vmov.u8 r0, q0[15]
; CHECK-NEXT:    strb r0, [r7]
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
entry:
  %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

; Two geps with constant offsets: folded into a single constant-pool vector.
define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
; CHECK-LABEL: unscaled_v16i8_i8_2gep2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    adr r1, .LCPI11_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vstrb.8 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI11_0:
; CHECK-NEXT:    .byte 5 @ 0x5
; CHECK-NEXT:    .byte 8 @ 0x8
; CHECK-NEXT:    .byte 11 @ 0xb
; CHECK-NEXT:    .byte 14 @ 0xe
; CHECK-NEXT:    .byte 17 @ 0x11
; CHECK-NEXT:    .byte 20 @ 0x14
; CHECK-NEXT:    .byte 23 @ 0x17
; CHECK-NEXT:    .byte 26 @ 0x1a
; CHECK-NEXT:    .byte 29 @ 0x1d
; CHECK-NEXT:    .byte 32 @ 0x20
; CHECK-NEXT:    .byte 35 @ 0x23
; CHECK-NEXT:    .byte 38 @ 0x26
; CHECK-NEXT:    .byte 41 @ 0x29
; CHECK-NEXT:    .byte 44 @ 0x2c
; CHECK-NEXT:    .byte 47 @ 0x2f
; CHECK-NEXT:    .byte 50 @ 0x32
entry:
  %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> <i8 0, i8 3, i8 6, i8 9, i8 12, i8 15, i8 18, i8 21, i8 24, i8 27, i8 30, i8 33, i8 36, i8 39, i8 42, i8 45>
  %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5
  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}


declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)