; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Codegen test for loops that convert between float and half vectors on a
; Thumbv8.1-M target with the mve.fp feature enabled.
;
; Every function below has the same shape: a single-block loop that counts
; %index from 0 up to 1024, loads a vector from %x, multiplies it in float
; precision by a splat of 0x4000CCCCC0000000, and stores the result to %y.
; That constant is roughly 2.1; its single-precision bit pattern is 0x40066666,
; which is the value the expected assembly builds in r2 with
; movw #26214 (0x6666) / movt #16390 (0x4006).
;
; Naming:
;   to_N      - load <N x float>, fmul, fptrunc to <N x half>, store.
;   from_N    - load <N x half>, fpext to <N x float>, fmul, store.
;   both_N    - load <N x half>, fpext, fmul, fptrunc back to half, store.
;   both_N_I  - as both_N, but the IR splits the loaded vector into even and
;               odd lanes before extending and re-interleaves the products
;               before truncating.
;
; The assertion lines are autogenerated (see the first line of the file); if
; any IR here is changed they have to be regenerated rather than hand-edited.

; float -> half, 4 lanes per iteration (1024 / 4 = 256 iterations, matching
; the loop count loaded into lr in the expected output).
define void @to_4(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <4 x float>*
  %wide.load = load <4 x float>, <4 x float>* %1, align 4
  %2 = fmul <4 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <4 x float> %2 to <4 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <4 x half>*
  store <4 x half> %3, <4 x half>* %5, align 2
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; float -> half, 8 lanes per iteration (128 iterations). The expected output
; handles the <8 x float> as two 4-lane load/multiply/convert/store groups.
define void @to_8(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #32
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <8 x float>*
  %wide.load = load <8 x float>, <8 x float>* %1, align 4
  %2 = fmul <8 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <8 x float> %2 to <8 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 2
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; float -> half, 16 lanes per iteration (64 iterations). The expected output
; handles the <16 x float> as four 4-lane load/multiply/convert/store groups.
define void @to_16(float* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: to_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #24]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #32]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #16]
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1, #8]
; CHECK-NEXT:    vldrw.u32 q0, [r0], #64
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, float* %x, i32 %index
  %1 = bitcast float* %0 to <16 x float>*
  %wide.load = load <16 x float>, <16 x float>* %1, align 4
  %2 = fmul <16 x float> %wide.load, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %3 = fptrunc <16 x float> %2 to <16 x half>
  %4 = getelementptr inbounds half, half* %y, i32 %index
  %5 = bitcast half* %4 to <16 x half>*
  store <16 x half> %3, <16 x half>* %5, align 2
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float, 4 lanes per iteration (256 iterations): load <4 x half>,
; extend to float, multiply, store <4 x float>.
define void @from_4(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <4 x half>*
  %wide.load = load <4 x half>, <4 x half>* %1, align 2
  %2 = fpext <4 x half> %wide.load to <4 x float>
  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <4 x float>*
  store <4 x float> %3, <4 x float>* %5, align 4
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float, 8 lanes per iteration (128 iterations). The expected output
; uses two 4-lane widening loads (q0, q1) and two 4-lane float stores.
define void @from_8(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #16
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #32
; CHECK-NEXT:    le lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = fpext <8 x half> %wide.load to <8 x float>
  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <8 x float>*
  store <8 x float> %3, <8 x float>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float, 16 lanes per iteration (64 iterations). The expected output
; uses four 4-lane widening loads (q0-q3) and four 4-lane float stores.
define void @from_16(half* nocapture readonly %x, float* noalias nocapture %y) {
; CHECK-LABEL: from_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #32
; CHECK-NEXT:    vldrh.u32 q1, [r0, #-24]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #-16]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #-8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-NEXT:    vmul.f32 q2, q2, r2
; CHECK-NEXT:    vmul.f32 q3, q3, r2
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vstrw.32 q3, [r1, #48]
; CHECK-NEXT:    vstrw.32 q2, [r1, #32]
; CHECK-NEXT:    vstrw.32 q1, [r1, #16]
; CHECK-NEXT:    vstrw.32 q0, [r1], #64
; CHECK-NEXT:    le lr, .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = fpext <16 x half> %wide.load to <16 x float>
  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = getelementptr inbounds float, float* %y, i32 %index
  %5 = bitcast float* %4 to <16 x float>*
  store <16 x float> %3, <16 x float>* %5, align 4
  %index.next = add i32 %index, 16
  %6 = icmp eq i32 %index.next, 1024
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float -> half round trip, 4 lanes per iteration (256 iterations):
; extend, multiply in float, truncate back to half.
define void @both_4(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u32 q0, [r0], #8
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <4 x half>*
  %wide.load = load <4 x half>, <4 x half>* %1, align 2
  %2 = fpext <4 x half> %wide.load to <4 x float>
  %3 = fmul <4 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <4 x float> %3 to <4 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <4 x half>*
  store <4 x half> %4, <4 x half>* %6, align 2
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float -> half round trip, 8 lanes per iteration (128 iterations).
; The expected output does one full-width 16-bit load, a vcvtb/vcvtt pair in
; each direction, and one full-width store.
define void @both_8(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = fpext <8 x half> %wide.load to <8 x float>
  %3 = fmul <8 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <8 x float> %3 to <8 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <8 x half>*
  store <8 x half> %4, <8 x half>* %6, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; half -> float -> half round trip, 16 lanes per iteration (64 iterations).
; The expected output repeats the both_8 load/convert/multiply/convert/store
; sequence twice per iteration, once for each 8-lane half of the vector.
define void @both_16(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0], #32
; CHECK-NEXT:    vstrh.16 q1, [r1, #16]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrh.16 q1, [r1], #32
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = fpext <16 x half> %wide.load to <16 x float>
  %3 = fmul <16 x float> %2, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %4 = fptrunc <16 x float> %3 to <16 x half>
  %5 = getelementptr inbounds half, half* %y, i32 %index
  %6 = bitcast half* %5 to <16 x half>*
  store <16 x half> %4, <16 x half>* %6, align 2
  %index.next = add i32 %index, 16
  %7 = icmp eq i32 %index.next, 1024
  br i1 %7, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Interleaved variant of both_8: the IR splits the loaded <8 x half> into its
; even lanes (0,2,4,6) and odd lanes (1,3,5,7), extends and multiplies each
; 4-lane half separately, then re-interleaves the two products (0,4,1,5,...)
; before truncating and storing. The expected loop body is the same
; instruction sequence as in both_8.
define void @both_8_I(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_8_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB9_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    le lr, .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 2
  %2 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %3 = shufflevector <8 x half> %wide.load, <8 x half> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %4 = fpext <4 x half> %2 to <4 x float>
  %5 = fpext <4 x half> %3 to <4 x float>
  %6 = fmul <4 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %7 = fmul <4 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %8 = shufflevector <4 x float> %6, <4 x float> %7, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %9 = fptrunc <8 x float> %8 to <8 x half>
  %10 = getelementptr inbounds half, half* %y, i32 %index
  %11 = bitcast half* %10 to <8 x half>*
  store <8 x half> %9, <8 x half>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Interleaved variant of both_16: even/odd lane split of a <16 x half>, with
; each 8-lane half extended and multiplied separately, then re-interleaved,
; truncated and stored.
;
; NOTE(review): %index advances by 8 here although each iteration loads and
; stores 16 halves, so successive iterations overlap by 8 elements. The
; expected output agrees with the IR as written (128 iterations, pointers
; stepping by 16 bytes while 32 bytes are accessed). Possibly the step was
; meant to be 16 - confirm before changing, and regenerate the assertions if
; it is changed.
define void @both_16_I(half* nocapture readonly %x, half* noalias nocapture %y) {
; CHECK-LABEL: both_16_I:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:    movw r2, #26214
; CHECK-NEXT:    movt r2, #16390
; CHECK-NEXT:  .LBB10_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vldrh.u16 q0, [r0, #16]!
; CHECK-NEXT:    vstrh.16 q1, [r1]
; CHECK-NEXT:    vcvtb.f32.f16 q1, q0
; CHECK-NEXT:    vcvtt.f32.f16 q0, q0
; CHECK-NEXT:    vmul.f32 q1, q1, r2
; CHECK-NEXT:    vmul.f32 q0, q0, r2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vstrb.8 q1, [r1, #16]!
; CHECK-NEXT:    le lr, .LBB10_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds half, half* %x, i32 %index
  %1 = bitcast half* %0 to <16 x half>*
  %wide.load = load <16 x half>, <16 x half>* %1, align 2
  %2 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %3 = shufflevector <16 x half> %wide.load, <16 x half> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %4 = fpext <8 x half> %2 to <8 x float>
  %5 = fpext <8 x half> %3 to <8 x float>
  %6 = fmul <8 x float> %4, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %7 = fmul <8 x float> %5, <float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000, float 0x4000CCCCC0000000>
  %8 = shufflevector <8 x float> %6, <8 x float> %7, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %9 = fptrunc <16 x float> %8 to <16 x half>
  %10 = getelementptr inbounds half, half* %y, i32 %index
  %11 = bitcast half* %10 to <16 x half>*
  store <16 x half> %9, <16 x half>* %11, align 2
  %index.next = add i32 %index, 8
  %12 = icmp eq i32 %index.next, 1024
  br i1 %12, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}