// RUN: %clang_cc1 -triple aarch64-arm-none-eabi -target-feature +neon -target-feature +bf16 \
// RUN:  -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK64
// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +bf16 -mfloat-abi hard \
// RUN:  -O2 -emit-llvm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK32

// REQUIRES: arm-registered-target,aarch64-registered-target

#include "arm_neon.h"
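
// This test checks the LLVM IR emitted at -O2 for the Armv8.6-A bfloat16 NEON
// load/store intrinsics (vld1/vst1 through vld4/vst4, including their _lane,
// _dup and _x2/_x3/_x4 variants). The CHECK64 prefix matches the AArch64
// lowering and CHECK32 matches the 32-bit Arm (hard-float) lowering.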

bfloat16x4_t test_vld1_bf16(bfloat16_t const *ptr) {
  return vld1_bf16(ptr);
}
// CHECK-LABEL: test_vld1_bf16
// CHECK64: %1 = load <4 x bfloat>, <4 x bfloat>* %0
// CHECK64-NEXT: ret <4 x bfloat> %1
// CHECK32: %1 = load <4 x bfloat>, <4 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <4 x bfloat> %1

bfloat16x8_t test_vld1q_bf16(bfloat16_t const *ptr) {
  return vld1q_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_bf16
// CHECK64: %1 = load <8 x bfloat>, <8 x bfloat>* %0
// CHECK64-NEXT: ret <8 x bfloat> %1
// CHECK32: %1 = load <8 x bfloat>, <8 x bfloat>* %0, align 2
// CHECK32-NEXT: ret <8 x bfloat> %1

bfloat16x4_t test_vld1_lane_bf16(bfloat16_t const *ptr, bfloat16x4_t src) {
  return vld1_lane_bf16(ptr, src, 0);
}
// CHECK-LABEL: test_vld1_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK64-NEXT: ret <4 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <4 x bfloat> %src, bfloat %0, i32 0
// CHECK32-NEXT: ret <4 x bfloat> %vld1_lane

bfloat16x8_t test_vld1q_lane_bf16(bfloat16_t const *ptr, bfloat16x8_t src) {
  return vld1q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld1q_lane_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK64-NEXT: ret <8 x bfloat> %vld1_lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %vld1_lane = insertelement <8 x bfloat> %src, bfloat %0, i32 7
// CHECK32-NEXT: ret <8 x bfloat> %vld1_lane

bfloat16x4_t test_vld1_dup_bf16(bfloat16_t const *ptr) {
  return vld1_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK64-NEXT: ret <4 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <4 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <4 x bfloat> %1, <4 x bfloat> undef, <4 x i32> zeroinitializer
// CHECK32-NEXT: ret <4 x bfloat> %lane

bfloat16x4x2_t test_vld1_bf16_x2(bfloat16_t const *ptr) {
  return vld1_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x2
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x2.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x2.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x2_t test_vld1q_bf16_x2(bfloat16_t const *ptr) {
  return vld1q_bf16_x2(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x2
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x2.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x2.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x3_t test_vld1_bf16_x3(bfloat16_t const *ptr) {
  return vld1_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x3
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x3.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x3.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x3_t test_vld1q_bf16_x3(bfloat16_t const *ptr) {
  return vld1q_bf16_x3(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x3
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x3.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x3.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x4x4_t test_vld1_bf16_x4(bfloat16_t const *ptr) {
  return vld1_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1_bf16_x4
// CHECK64: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld1x4.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld1x4.v4bf16.p0bf16(bfloat* %ptr)

bfloat16x8x4_t test_vld1q_bf16_x4(bfloat16_t const *ptr) {
  return vld1q_bf16_x4(ptr);
}
// CHECK-LABEL: test_vld1q_bf16_x4
// CHECK64: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld1x4.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld1xN = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld1x4.v8bf16.p0bf16(bfloat* %ptr)

bfloat16x8_t test_vld1q_dup_bf16(bfloat16_t const *ptr) {
  return vld1q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld1q_dup_bf16
// CHECK64: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK64-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK64-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK64-NEXT: ret <8 x bfloat> %lane
// CHECK32: %0 = load bfloat, bfloat* %ptr, align 2
// CHECK32-NEXT: %1 = insertelement <8 x bfloat> undef, bfloat %0, i32 0
// CHECK32-NEXT: %lane = shufflevector <8 x bfloat> %1, <8 x bfloat> undef, <8 x i32> zeroinitializer
// CHECK32-NEXT: ret <8 x bfloat> %lane
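
// The following tests cover the structure loads vld2/vld3/vld4 and their
// _lane and _dup variants, which lower to target-specific NEON load
// intrinsics rather than plain vector loads.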
bfloat16x4x2_t test_vld2_bf16(bfloat16_t const *ptr) {
  return vld2_bf16(ptr);
}
// CHECK-LABEL: test_vld2_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_bf16(bfloat16_t const *ptr) {
  return vld2q_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld2q_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x2_t test_vld2_lane_bf16(bfloat16_t const *ptr, bfloat16x4x2_t src) {
  return vld2_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld2_lane_bf16
// CHECK64: %vld2_lane = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: %vld2_lane_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2lane.v4bf16.p0i8(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

bfloat16x8x2_t test_vld2q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x2_t src) {
  return vld2q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld2q_lane_bf16
// CHECK64: %vld2_lane = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: %vld2q_lane_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2lane.v8bf16.p0i8(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

bfloat16x4x3_t test_vld3_bf16(bfloat16_t const *ptr) {
  return vld3_bf16(ptr);
}
// CHECK-LABEL: test_vld3_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_bf16(bfloat16_t const *ptr) {
  return vld3q_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_lane_bf16(bfloat16_t const *ptr, bfloat16x4x3_t src) {
  return vld3_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld3_lane_bf16
// CHECK64: %vld3_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3lane.v4bf16.p0i8(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

bfloat16x8x3_t test_vld3q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x3_t src) {
  return vld3q_lane_bf16(ptr, src, 7);
  // return vld3q_lane_bf16(ptr, src, 8);
}
// CHECK-LABEL: test_vld3q_lane_bf16
// CHECK64: %vld3_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: %3 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld3q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3lane.v8bf16.p0i8(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

bfloat16x4x4_t test_vld4_bf16(bfloat16_t const *ptr) {
  return vld4_bf16(ptr);
}
// CHECK-LABEL: test_vld4_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4.v4bf16.p0v4bf16(<4 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_bf16(bfloat16_t const *ptr) {
  return vld4q_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4.v8bf16.p0v8bf16(<8 x bfloat>* %0)
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_lane_bf16(bfloat16_t const *ptr, bfloat16x4x4_t src) {
  return vld4_lane_bf16(ptr, src, 1);
}
// CHECK-LABEL: test_vld4_lane_bf16
// CHECK64: %vld4_lane = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4lane.v4bf16.p0i8(<4 x bfloat> %src.coerce.fca.0.extract, <4 x bfloat> %src.coerce.fca.1.extract, <4 x bfloat> %src.coerce.fca.2.extract, <4 x bfloat> %src.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4_lane_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4lane.v4bf16.p0i8(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

bfloat16x8x4_t test_vld4q_lane_bf16(bfloat16_t const *ptr, bfloat16x8x4_t src) {
  return vld4q_lane_bf16(ptr, src, 7);
}
// CHECK-LABEL: test_vld4q_lane_bf16
// CHECK64: %vld4_lane = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4lane.v8bf16.p0i8(<8 x bfloat> %src.coerce.fca.0.extract, <8 x bfloat> %src.coerce.fca.1.extract, <8 x bfloat> %src.coerce.fca.2.extract, <8 x bfloat> %src.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: %4 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: %vld4q_lane_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4lane.v8bf16.p0i8(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)

bfloat16x4x2_t test_vld2_dup_bf16(bfloat16_t const *ptr) {
  return vld2_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2_dup_bf16
// CHECK64: %vld2 = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld2r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2_dup_v = tail call { <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld2dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x2_t test_vld2q_dup_bf16(bfloat16_t const *ptr) {
  return vld2q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld2q_dup_bf16
// CHECK64: %vld2 = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld2r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld2q_dup_v = tail call { <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld2dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x3_t test_vld3_dup_bf16(bfloat16_t const *ptr) {
  return vld3_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3_dup_bf16
// CHECK64: %vld3 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld3r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld3dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x3_t test_vld3q_dup_bf16(bfloat16_t const *ptr) {
  return vld3q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld3q_dup_bf16
// CHECK64: %vld3 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld3r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld3q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld3dup.v8bf16.p0i8(i8* %0, i32 2)

bfloat16x4x4_t test_vld4_dup_bf16(bfloat16_t const *ptr) {
  return vld4_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4_dup_bf16
// CHECK64: %vld4 = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.aarch64.neon.ld4r.v4bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4_dup_v = tail call { <4 x bfloat>, <4 x bfloat>, <4 x bfloat>, <4 x bfloat> } @llvm.arm.neon.vld4dup.v4bf16.p0i8(i8* %0, i32 2)

bfloat16x8x4_t test_vld4q_dup_bf16(bfloat16_t const *ptr) {
  return vld4q_dup_bf16(ptr);
}
// CHECK-LABEL: test_vld4q_dup_bf16
// CHECK64: %vld4 = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.aarch64.neon.ld4r.v8bf16.p0bf16(bfloat* %ptr)
// CHECK32: %vld4q_dup_v = tail call { <8 x bfloat>, <8 x bfloat>, <8 x bfloat>, <8 x bfloat> } @llvm.arm.neon.vld4dup.v8bf16.p0i8(i8* %0, i32 2)
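
// The remaining tests cover the store intrinsics vst1 through vst4. Single
// vst1/vst1q stores lower to plain vector stores on AArch64, while the
// structured and lane variants lower to target-specific NEON store intrinsics.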
void test_vst1_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <4 x bfloat>*
// CHECK64-NEXT: store <4 x bfloat> %val, <4 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v4bf16(i8* %0, <4 x bfloat> %val, i32 2)

void test_vst1q_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16
// CHECK64: %0 = bitcast bfloat* %ptr to <8 x bfloat>*
// CHECK64-NEXT: store <8 x bfloat> %val, <8 x bfloat>* %0, align 2
// CHECK32: %0 = bitcast bfloat* %ptr to i8*
// CHECK32-NEXT: tail call void @llvm.arm.neon.vst1.p0i8.v8bf16(i8* %0, <8 x bfloat> %val, i32 2)

void test_vst1_lane_bf16(bfloat16_t *ptr, bfloat16x4_t val) {
  vst1_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst1_lane_bf16
// CHECK64: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <4 x bfloat> %val, i32 1
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

void test_vst1q_lane_bf16(bfloat16_t *ptr, bfloat16x8_t val) {
  vst1q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst1q_lane_bf16
// CHECK64: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK64-NEXT: store bfloat %0, bfloat* %ptr, align 2
// CHECK32: %0 = extractelement <8 x bfloat> %val, i32 7
// CHECK32-NEXT: store bfloat %0, bfloat* %ptr, align 2

void test_vst1_bf16_x2(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst1_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1)

void test_vst1q_bf16_x2(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst1q_bf16_x2(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x2
// CHECK64: tail call void @llvm.aarch64.neon.st1x2.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x2.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1)

void test_vst1_bf16_x3(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst1_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2)

void test_vst1q_bf16_x3(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst1q_bf16_x3(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x3
// CHECK64: tail call void @llvm.aarch64.neon.st1x3.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x3.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2)

void test_vst1_bf16_x4(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst1_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v4bf16.p0bf16(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v4bf16(bfloat* %ptr, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3)

void test_vst1q_bf16_x4(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst1q_bf16_x4(ptr, val);
}
// CHECK-LABEL: test_vst1q_bf16_x4
// CHECK64: tail call void @llvm.aarch64.neon.st1x4.v8bf16.p0bf16(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, bfloat* %ptr)
// CHECK32: tail call void @llvm.arm.neon.vst1x4.p0bf16.v8bf16(bfloat* %ptr, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3)

void test_vst2_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 2)

void test_vst2q_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst2q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 2)

void test_vst2_lane_bf16(bfloat16_t *ptr, bfloat16x4x2_t val) {
  vst2_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst2_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v4bf16(i8* %2, <4 x bfloat> %0, <4 x bfloat> %1, i32 1, i32 2)

void test_vst2q_lane_bf16(bfloat16_t *ptr, bfloat16x8x2_t val) {
  vst2q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst2q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st2lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst2lane.p0i8.v8bf16(i8* %2, <8 x bfloat> %0, <8 x bfloat> %1, i32 7, i32 2)

void test_vst3_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 2)

void test_vst3q_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst3q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 2)

void test_vst3_lane_bf16(bfloat16_t *ptr, bfloat16x4x3_t val) {
  vst3_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst3_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v4bf16(i8* %3, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, i32 1, i32 2)

void test_vst3q_lane_bf16(bfloat16_t *ptr, bfloat16x8x3_t val) {
  vst3q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst3q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st3lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst3lane.p0i8.v8bf16(i8* %3, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, i32 7, i32 2)

void test_vst4_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 2)

void test_vst4q_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_bf16(ptr, val);
}
// CHECK-LABEL: test_vst4q_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 2)

void test_vst4_lane_bf16(bfloat16_t *ptr, bfloat16x4x4_t val) {
  vst4_lane_bf16(ptr, val, 1);
}
// CHECK-LABEL: test_vst4_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v4bf16.p0i8(<4 x bfloat> %val.coerce.fca.0.extract, <4 x bfloat> %val.coerce.fca.1.extract, <4 x bfloat> %val.coerce.fca.2.extract, <4 x bfloat> %val.coerce.fca.3.extract, i64 1, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v4bf16(i8* %4, <4 x bfloat> %0, <4 x bfloat> %1, <4 x bfloat> %2, <4 x bfloat> %3, i32 1, i32 2)

void test_vst4q_lane_bf16(bfloat16_t *ptr, bfloat16x8x4_t val) {
  vst4q_lane_bf16(ptr, val, 7);
}
// CHECK-LABEL: test_vst4q_lane_bf16
// CHECK64: tail call void @llvm.aarch64.neon.st4lane.v8bf16.p0i8(<8 x bfloat> %val.coerce.fca.0.extract, <8 x bfloat> %val.coerce.fca.1.extract, <8 x bfloat> %val.coerce.fca.2.extract, <8 x bfloat> %val.coerce.fca.3.extract, i64 7, i8* %0)
// CHECK32: tail call void @llvm.arm.neon.vst4lane.p0i8.v8bf16(i8* %4, <8 x bfloat> %0, <8 x bfloat> %1, <8 x bfloat> %2, <8 x bfloat> %3, i32 7, i32 2)