; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Tests for half <-> float vector conversions on MVE:
; plain fpext/fptrunc, interleaving shuffles combined with fptrunc
; (which should become vcvtb/vcvtt pairs), and extending loads /
; truncating stores.

define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
; CHECK-LABEL: fpext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <4 x half> %src1 to <4 x float>
  ret <4 x float> %out
}

define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
; CHECK-LABEL: fpext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
; CHECK-NEXT:    vcvtb.f32.f16 s4, s2
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <8 x half> %src1 to <8 x float>
  ret <8 x float> %out
}


define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
; CHECK-LABEL: fptrunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s4, s0
; CHECK-NEXT:    vcvtt.f16.f32 s4, s1
; CHECK-NEXT:    vcvtb.f16.f32 s5, s2
; CHECK-NEXT:    vcvtt.f16.f32 s5, s3
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <4 x float> %src1 to <4 x half>
  ret <4 x half> %out
}

define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
; CHECK-LABEL: fptrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-NEXT:    vcvtt.f16.f32 s0, s9
; CHECK-NEXT:    vcvtb.f16.f32 s1, s10
; CHECK-NEXT:    vcvtt.f16.f32 s1, s11
; CHECK-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <8 x float> %src1 to <8 x half>
  ret <8 x half> %out
}


; Interleaving shuffle of two f32 vectors followed by fptrunc:
; expected to select whole-vector vcvtb.f16.f32 / vcvtt.f16.f32 pairs.

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

; Same patterns, but with the fptrunc performed before the shuffle.

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x half> %s
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x half> %s
}


; Extending loads: load half vectors and fpext to float.

define arm_aapcs_vfpcc <4 x float> @load_ext_4(<4 x half>* %src) {
; CHECK-LABEL: load_ext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x half>, <4 x half>* %src, align 4
  %e = fpext <4 x half> %wide.load to <4 x float>
  ret <4 x float> %e
}

define arm_aapcs_vfpcc <8 x float> @load_ext_8(<8 x half>* %src) {
; CHECK-LABEL: load_ext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, <8 x half>* %src, align 4
  %e = fpext <8 x half> %wide.load to <8 x float>
  ret <8 x float> %e
}

define arm_aapcs_vfpcc <16 x float> @load_ext_16(<16 x half>* %src) {
; CHECK-LABEL: load_ext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #24]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, <16 x half>* %src, align 4
  %e = fpext <16 x half> %wide.load to <16 x float>
  ret <16 x float> %e
}

define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(<8 x half>* %src) {
; CHECK-LABEL: load_shuffleext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, <8 x half>* %src, align 4
  %sh = shufflevector <8 x half> %wide.load, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %e = fpext <4 x half> %sh to <4 x float>
  ret <4 x float> %e
}

define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) {
; CHECK-LABEL: load_shuffleext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, <16 x half>* %src, align 4
  %sh = shufflevector <16 x half> %wide.load, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %e = fpext <8 x half> %sh to <8 x float>
  ret <8 x float> %e
}


; Truncating stores: fptrunc float vectors to half and store.

define arm_aapcs_vfpcc void @store_trunc_4(<4 x half>* %src, <4 x float> %val) {
; CHECK-LABEL: store_trunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %e, <4 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_trunc_8(<8 x half>* %src, <8 x float> %val) {
; CHECK-LABEL: store_trunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %e, <8 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_trunc_16(<16 x half>* %src, <16 x float> %val) {
; CHECK-LABEL: store_trunc_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q3, [r0, #24]
; CHECK-NEXT:    vstrh.32 q2, [r0, #16]
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %e, <16 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_shuffletrunc_8(<8 x half>* %src, <4 x float> %val1, <4 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  store <8 x half> %out, <8 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_shuffletrunc_16(<16 x half>* %src, <8 x float> %val1, <8 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  store <16 x half> %out, <16 x half>* %src, align 4
  ret void
}