; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-arm-none-eabi -mattr=+bf16 %s -o - | FileCheck %s

; Code generation tests for the AArch64 NEON bfloat16 intrinsics used in this
; file: bfdot, bfmmla, bfmlalb and bfmlalt, in both their vector and their
; indexed (lane) forms. Every test bitcasts its bfloat operands to i8 vectors
; because that is the operand type of the intrinsic declarations at the end of
; the file.
;
; NOTE(review): the tests whose %b operand is a 64-bit vector used as an
; indexed operand match the instruction loosely (it is not pinned to the line
; directly after the block label). Presumably this tolerates an extra
; annotation line that llc may print when widening %b -- confirm before
; tightening those patterns.

; bfdot, 64-bit accumulator, both multiplicands used as full vectors.
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %a to <8 x i8>
  %1 = bitcast <4 x bfloat> %b to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
  ret <2 x float> %vbfdot1.i
}

; bfdot, 128-bit accumulator, both multiplicands used as full vectors.
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfdot1.i
}

; bfdot, 64-bit accumulator, %b is a 64-bit vector whose 32-bit element 0
; (a pair of bfloats) is splatted; expected to select the indexed form [0].
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:         bfdot v0.2s, v1.4h, v2.2h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %b to <2 x float>
  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
  %1 = bitcast <4 x bfloat> %a to <8 x i8>
  %2 = bitcast <2 x float> %shuffle to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
  ret <2 x float> %vbfdot1.i
}

; bfdot, 128-bit accumulator, %b is a 128-bit vector whose 32-bit element 3
; is splatted; expected to select the indexed form [3].
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.2h[3]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %b to <4 x float>
  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %1 = bitcast <8 x bfloat> %a to <16 x i8>
  %2 = bitcast <4 x float> %shuffle to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
  ret <4 x float> %vbfdot1.i
}

; bfdot, 64-bit accumulator, %b is a 128-bit vector whose 32-bit element 3
; is splatted into a 64-bit vector; expected to select the indexed form [3].
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.2h[3]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %b to <4 x float>
  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %1 = bitcast <4 x bfloat> %a to <8 x i8>
  %2 = bitcast <2 x float> %shuffle to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
  ret <2 x float> %vbfdot1.i
}

; bfdot, 128-bit accumulator, %b is a 64-bit vector whose 32-bit element 0
; is splatted into a 128-bit vector; expected to select the indexed form [0].
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:         bfdot v0.4s, v1.8h, v2.2h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %b to <2 x float>
  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
  %1 = bitcast <8 x bfloat> %a to <16 x i8>
  %2 = bitcast <4 x float> %shuffle to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
  ret <4 x float> %vbfdot1.i
}

; bfmmla on full 128-bit vectors.
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmmlaq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmmla v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmmla1.i
}

; bfmlalb on full 128-bit vectors.
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; bfmlalt on full 128-bit vectors.
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; bfmlalb with bfloat element 0 of a 64-bit %b splatted to all eight lanes;
; expected to select the indexed form h[0].
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:         bfmlalb v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; bfmlalb with bfloat element 3 of a 128-bit %b splatted to all eight lanes;
; expected to select the indexed form h[3].
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.h[3]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; bfmlalt with bfloat element 0 of a 64-bit %b splatted to all eight lanes;
; expected to select the indexed form h[0].
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:         bfmlalt v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; bfmlalt with bfloat element 3 of a 128-bit %b splatted to all eight lanes;
; expected to select the indexed form h[3].
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.h[3]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; Intrinsic declarations used by the tests above.
; NOTE(review): these reference attribute group #2, but no definition of that
; group is visible in this file -- confirm whether it is intentional.
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2