; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=LE %s
; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=BE %s

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %3 = trunc <16 x i16> %2 to <16 x i8>
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %3 = trunc <16 x i16> %2 to <16 x i8>
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_m_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 0, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_m_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_m_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 0, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_m_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_m_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_m_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_m_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_m_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, <4 x i1> %1)
  ret <8 x i16> %2
}

declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8>, <8 x i16>, i32, <8 x i1>)
declare <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16>, <4 x i32>, i32, <4 x i1>)