; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple aarch64-arm-none-eabi  -mattr=+bf16 %s -o - | FileCheck %s

; 64-bit vector form of bfdot: both <4 x bfloat> operands are bitcast to
; <8 x i8> and passed to the v2f32 intrinsic together with the accumulator %r.
; The expected output is a single vector-by-vector bfdot followed by ret.
define <2 x float> @test_vbfdot_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.4h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %a to <8 x i8>
  %1 = bitcast <4 x bfloat> %b to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %0, <8 x i8> %1)
  ret <2 x float> %vbfdot1.i
}

; 128-bit vector form of bfdot: both <8 x bfloat> operands are bitcast to
; <16 x i8> and passed to the v4f32 intrinsic together with the accumulator %r.
; The expected output is a single vector-by-vector bfdot followed by ret.
define <4 x float> @test_vbfdotq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfdot1.i
}

; Indexed bfdot, 64-bit accumulator, 64-bit index source: %b is viewed as
; <2 x float> and 32-bit lane 0 (one pair of bfloats) is splatted with a
; shufflevector before the intrinsic call. The splat is expected to fold into
; the by-element operand v2.2h[0].
; The instruction line below is a plain check rather than a next-line check, so
; other output may appear between the block label and the bfdot (presumably a
; register-widening kill comment for the 64-bit %b; confirm against llc output).
define <2 x float> @test_vbfdot_lane_f32(<2 x float> %r, <4 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:    bfdot v0.2s, v1.4h, v2.2h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %b to <2 x float>
  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <2 x i32> zeroinitializer
  %1 = bitcast <4 x bfloat> %a to <8 x i8>
  %2 = bitcast <2 x float> %shuffle to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
  ret <2 x float> %vbfdot1.i
}

; Indexed bfdot, 128-bit accumulator, 128-bit index source: %b is viewed as
; <4 x float> and 32-bit lane 3 is splatted across all four lanes before the
; intrinsic call. The splat is expected to fold into the by-element operand
; v2.2h[3], leaving a single bfdot followed by ret.
define <4 x float> @test_vbfdotq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.4s, v1.8h, v2.2h[3]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %b to <4 x float>
  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
  %1 = bitcast <8 x bfloat> %a to <16 x i8>
  %2 = bitcast <4 x float> %shuffle to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
  ret <4 x float> %vbfdot1.i
}

; Indexed bfdot, 64-bit accumulator, 128-bit index source: %b is viewed as
; <4 x float> and 32-bit lane 3 is extracted into a two-lane splat before the
; v2f32 intrinsic call. The splat is expected to fold into the by-element
; operand v2.2h[3], leaving a single bfdot followed by ret.
define <2 x float> @test_vbfdot_laneq_f32(<2 x float> %r, <4 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfdot_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfdot v0.2s, v1.4h, v2.2h[3]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %b to <4 x float>
  %shuffle = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32> <i32 3, i32 3>
  %1 = bitcast <4 x bfloat> %a to <8 x i8>
  %2 = bitcast <2 x float> %shuffle to <8 x i8>
  %vbfdot1.i = tail call <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float> %r, <8 x i8> %1, <8 x i8> %2)
  ret <2 x float> %vbfdot1.i
}

; Indexed bfdot, 128-bit accumulator, 64-bit index source: %b is viewed as
; <2 x float> and 32-bit lane 0 is splatted into four lanes before the v4f32
; intrinsic call. The splat is expected to fold into the by-element operand
; v2.2h[0].
; The instruction line below is a plain check rather than a next-line check, so
; other output may appear between the block label and the bfdot (presumably a
; register-widening kill comment for the 64-bit %b; confirm against llc output).
define <4 x float> @test_vbfdotq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfdotq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:    bfdot v0.4s, v1.8h, v2.2h[0]
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <4 x bfloat> %b to <2 x float>
  %shuffle = shufflevector <2 x float> %0, <2 x float> undef, <4 x i32> zeroinitializer
  %1 = bitcast <8 x bfloat> %a to <16 x i8>
  %2 = bitcast <4 x float> %shuffle to <16 x i8>
  %vbfdot1.i = tail call <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float> %r, <16 x i8> %1, <16 x i8> %2)
  ret <4 x float> %vbfdot1.i
}

; bfmmla: both <8 x bfloat> operands are bitcast to <16 x i8> and passed to the
; bfmmla intrinsic together with the <4 x float> accumulator %r. The expected
; output is a single bfmmla on v0.4s, v1.8h, v2.8h followed by ret.
define <4 x float> @test_vbfmmlaq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmmlaq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmmla v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmmla1.i
}

; Vector form of bfmlalb: both <8 x bfloat> operands are bitcast to <16 x i8>
; and passed to the bfmlalb intrinsic together with the accumulator %r. The
; expected output is a single bfmlalb on v0.4s, v1.8h, v2.8h followed by ret.
define <4 x float> @test_vbfmlalbq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; Vector form of bfmlalt: both <8 x bfloat> operands are bitcast to <16 x i8>
; and passed to the bfmlalt intrinsic together with the accumulator %r. The
; expected output is a single bfmlalt on v0.4s, v1.8h, v2.8h followed by ret.
define <4 x float> @test_vbfmlaltq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.8h
; CHECK-NEXT:    ret
entry:
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %b to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; Indexed bfmlalb with a 64-bit index source: element 0 of the <4 x bfloat> %b
; is splatted into an <8 x bfloat> vector before the intrinsic call. The splat
; is expected to fold into the by-element operand v2.h[0].
; The instruction line below is a plain check rather than a next-line check, so
; other output may appear between the block label and the bfmlalb (presumably a
; register-widening kill comment for the 64-bit %b; confirm against llc output).
define <4 x float> @test_vbfmlalbq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:    bfmlalb v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; Indexed bfmlalb with a 128-bit index source: element 3 of the <8 x bfloat> %b
; is splatted across all eight lanes before the intrinsic call. The splat is
; expected to fold into the by-element operand v2.h[3], leaving a single
; bfmlalb followed by ret.
define <4 x float> @test_vbfmlalbq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlalbq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalb v0.4s, v1.8h, v2.h[3]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalb1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalb1.i
}

; Indexed bfmlalt with a 64-bit index source: element 0 of the <4 x bfloat> %b
; is splatted into an <8 x bfloat> vector before the intrinsic call. The splat
; is expected to fold into the by-element operand v2.h[0].
; The instruction line below is a plain check rather than a next-line check, so
; other output may appear between the block label and the bfmlalt (presumably a
; register-widening kill comment for the 64-bit %b; confirm against llc output).
define <4 x float> @test_vbfmlaltq_lane_f32(<4 x float> %r, <8 x bfloat> %a, <4 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_lane_f32:
; CHECK:       // %bb.0: // %entry
; CHECK:    bfmlalt v0.4s, v1.8h, v2.h[0]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <4 x bfloat> %b, <4 x bfloat> undef, <8 x i32> zeroinitializer
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; Indexed bfmlalt with a 128-bit index source: element 3 of the <8 x bfloat> %b
; is splatted across all eight lanes before the intrinsic call. The splat is
; expected to fold into the by-element operand v2.h[3], leaving a single
; bfmlalt followed by ret.
define <4 x float> @test_vbfmlaltq_laneq_f32(<4 x float> %r, <8 x bfloat> %a, <8 x bfloat> %b) {
; CHECK-LABEL: test_vbfmlaltq_laneq_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    bfmlalt v0.4s, v1.8h, v2.h[3]
; CHECK-NEXT:    ret
entry:
  %vecinit35 = shufflevector <8 x bfloat> %b, <8 x bfloat> undef, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
  %0 = bitcast <8 x bfloat> %a to <16 x i8>
  %1 = bitcast <8 x bfloat> %vecinit35 to <16 x i8>
  %vbfmlalt1.i = tail call <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float> %r, <16 x i8> %0, <16 x i8> %1)
  ret <4 x float> %vbfmlalt1.i
}

; Declarations of the AArch64 NEON bf16 intrinsics called by the tests in this
; file. Each takes a float accumulator vector and two byte-vector operands.
; NOTE(review): attribute group #2 has no definition in the visible part of
; this file; confirm whether it is defined elsewhere or can be dropped.
declare <2 x float> @llvm.aarch64.neon.bfdot.v2f32.v8i8(<2 x float>, <8 x i8>, <8 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfdot.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmmla.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalb.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
declare <4 x float> @llvm.aarch64.neon.bfmlalt.v4f32.v16i8(<4 x float>, <16 x i8>, <16 x i8>) #2
