; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

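; Vector fpext from f16 to f32: each pair of f32 lanes is produced from one
; f16 S register using vcvtb.f32.f16 (bottom half) and vcvtt.f32.f16 (top half).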
define arm_aapcs_vfpcc <4 x float> @fpext_4(<4 x half> %src1) {
; CHECK-LABEL: fpext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s7, s1
; CHECK-NEXT:    vcvtb.f32.f16 s6, s1
; CHECK-NEXT:    vcvtt.f32.f16 s5, s0
; CHECK-NEXT:    vcvtb.f32.f16 s4, s0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <4 x half> %src1 to <4 x float>
  ret <4 x float> %out
}

define arm_aapcs_vfpcc <8 x float> @fpext_8(<8 x half> %src1) {
; CHECK-LABEL: fpext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtt.f32.f16 s11, s1
; CHECK-NEXT:    vcvtt.f32.f16 s7, s3
; CHECK-NEXT:    vcvtb.f32.f16 s10, s1
; CHECK-NEXT:    vcvtb.f32.f16 s6, s3
; CHECK-NEXT:    vcvtt.f32.f16 s9, s0
; CHECK-NEXT:    vcvtt.f32.f16 s5, s2
; CHECK-NEXT:    vcvtb.f32.f16 s8, s0
; CHECK-NEXT:    vcvtb.f32.f16 s4, s2
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    bx lr
entry:
  %out = fpext <8 x half> %src1 to <8 x float>
  ret <8 x float> %out
}

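; Vector fptrunc from f32 to f16: pairs of f32 lanes are packed into the bottom
; and top halves of each f16 lane pair with vcvtb.f16.f32 / vcvtt.f16.f32.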
define arm_aapcs_vfpcc <4 x half> @fptrunc_4(<4 x float> %src1) {
; CHECK-LABEL: fptrunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 s4, s0
; CHECK-NEXT:    vcvtt.f16.f32 s4, s1
; CHECK-NEXT:    vcvtb.f16.f32 s5, s2
; CHECK-NEXT:    vcvtt.f16.f32 s5, s3
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <4 x float> %src1 to <4 x half>
  ret <4 x half> %out
}

define arm_aapcs_vfpcc <8 x half> @fptrunc_8(<8 x float> %src1) {
; CHECK-LABEL: fptrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vcvtb.f16.f32 s0, s8
; CHECK-NEXT:    vcvtt.f16.f32 s0, s9
; CHECK-NEXT:    vcvtb.f16.f32 s1, s10
; CHECK-NEXT:    vcvtt.f16.f32 s1, s11
; CHECK-NEXT:    vcvtb.f16.f32 s2, s4
; CHECK-NEXT:    vcvtt.f16.f32 s2, s5
; CHECK-NEXT:    vcvtb.f16.f32 s3, s6
; CHECK-NEXT:    vcvtt.f16.f32 s3, s7
; CHECK-NEXT:    bx lr
entry:
  %out = fptrunc <8 x float> %src1 to <8 x half>
  ret <8 x half> %out
}

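; An interleaving shufflevector combined with an fptrunc (in either order) is
; matched to whole-register vcvtb.f16.f32 / vcvtt.f16.f32 on q registers.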
define arm_aapcs_vfpcc <8 x half> @shuffle_trunc1(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc2(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %src1, <4 x float> %src2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  ret <8 x half> %out
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc3(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc4(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %src1, <8 x float> %src2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  ret <16 x half> %out
}

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc5(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <8 x half> @shuffle_trunc6(<4 x float> %src1, <4 x float> %src2) {
; CHECK-LABEL: shuffle_trunc6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q1, q0
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <4 x float> %src1 to <4 x half>
  %out2 = fptrunc <4 x float> %src2 to <4 x half>
  %s = shufflevector <4 x half> %out1, <4 x half> %out2, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x half> %s
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc7(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x half> %s
}

define arm_aapcs_vfpcc <16 x half> @shuffle_trunc8(<8 x float> %src1, <8 x float> %src2) {
; CHECK-LABEL: shuffle_trunc8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtt.f16.f32 q2, q0
; CHECK-NEXT:    vcvtt.f16.f32 q3, q1
; CHECK-NEXT:    vmov q0, q2
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    bx lr
entry:
  %out1 = fptrunc <8 x float> %src1 to <8 x half>
  %out2 = fptrunc <8 x float> %src2 to <8 x half>
  %s = shufflevector <8 x half> %out1, <8 x half> %out2, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x half> %s
}

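; Extending loads: a load of <N x half> followed by fpext becomes vldrh.u32
; plus a whole-register vcvtb.f32.f16; the even-lane shuffle variants can reuse
; vcvtb on the full vector or deinterleave with vld2 first.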
define arm_aapcs_vfpcc <4 x float> @load_ext_4(<4 x half>* %src) {
; CHECK-LABEL: load_ext_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <4 x half>, <4 x half>* %src, align 4
  %e = fpext <4 x half> %wide.load to <4 x float>
  ret <4 x float> %e
}

define arm_aapcs_vfpcc <8 x float> @load_ext_8(<8 x half>* %src) {
; CHECK-LABEL: load_ext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, <8 x half>* %src, align 4
  %e = fpext <8 x half> %wide.load to <8 x float>
  ret <8 x float> %e
}

define arm_aapcs_vfpcc <16 x float> @load_ext_16(<16 x half>* %src) {
; CHECK-LABEL: load_ext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q2, [r0, #16]
; CHECK-NEXT:    vldrh.u32 q3, [r0, #24]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    vcvtb.f32.f16 q1, q1
; CHECK-NEXT:    vcvtb.f32.f16 q2, q2
; CHECK-NEXT:    vcvtb.f32.f16 q3, q3
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, <16 x half>* %src, align 4
  %e = fpext <16 x half> %wide.load to <16 x float>
  ret <16 x float> %e
}

define arm_aapcs_vfpcc <4 x float> @load_shuffleext_8(<8 x half>* %src) {
; CHECK-LABEL: load_shuffleext_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vcvtb.f32.f16 q0, q0
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <8 x half>, <8 x half>* %src, align 4
  %sh = shufflevector <8 x half> %wide.load, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %e = fpext <4 x half> %sh to <4 x float>
  ret <4 x float> %e
}

define arm_aapcs_vfpcc <8 x float> @load_shuffleext_16(<16 x half>* %src) {
; CHECK-LABEL: load_shuffleext_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vld20.16 {q2, q3}, [r0]
; CHECK-NEXT:    vld21.16 {q2, q3}, [r0]
; CHECK-NEXT:    vcvtt.f32.f16 s3, s9
; CHECK-NEXT:    vcvtt.f32.f16 s7, s11
; CHECK-NEXT:    vcvtb.f32.f16 s2, s9
; CHECK-NEXT:    vcvtb.f32.f16 s6, s11
; CHECK-NEXT:    vcvtt.f32.f16 s1, s8
; CHECK-NEXT:    vcvtt.f32.f16 s5, s10
; CHECK-NEXT:    vcvtb.f32.f16 s0, s8
; CHECK-NEXT:    vcvtb.f32.f16 s4, s10
; CHECK-NEXT:    bx lr
entry:
  %wide.load = load <16 x half>, <16 x half>* %src, align 4
  %sh = shufflevector <16 x half> %wide.load, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %e = fpext <8 x half> %sh to <8 x float>
  ret <8 x float> %e
}

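; Truncating stores: fptrunc followed by a store becomes vcvtb.f16.f32 plus
; vstrh.32; when the fptrunc consumes an interleaving shuffle, the packed
; result is stored whole with vstrw.32.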
define arm_aapcs_vfpcc void @store_trunc_4(<4 x half>* %src, <4 x float> %val) {
; CHECK-LABEL: store_trunc_4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <4 x float> %val to <4 x half>
  store <4 x half> %e, <4 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_trunc_8(<8 x half>* %src, <8 x float> %val) {
; CHECK-LABEL: store_trunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <8 x float> %val to <8 x half>
  store <8 x half> %e, <8 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_trunc_16(<16 x half>* %src, <16 x float> %val) {
; CHECK-LABEL: store_trunc_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q3, q3
; CHECK-NEXT:    vcvtb.f16.f32 q2, q2
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vstrh.32 q3, [r0, #24]
; CHECK-NEXT:    vstrh.32 q2, [r0, #16]
; CHECK-NEXT:    vstrh.32 q1, [r0, #8]
; CHECK-NEXT:    vstrh.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %e = fptrunc <16 x float> %val to <16 x half>
  store <16 x half> %e, <16 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_shuffletrunc_8(<8 x half>* %src, <4 x float> %val1, <4 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q0, q1
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <4 x float> %val1, <4 x float> %val2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %out = fptrunc <8 x float> %strided.vec to <8 x half>
  store <8 x half> %out, <8 x half>* %src, align 4
  ret void
}

define arm_aapcs_vfpcc void @store_shuffletrunc_16(<16 x half>* %src, <8 x float> %val1, <8 x float> %val2) {
; CHECK-LABEL: store_shuffletrunc_16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vcvtb.f16.f32 q1, q1
; CHECK-NEXT:    vcvtb.f16.f32 q0, q0
; CHECK-NEXT:    vcvtt.f16.f32 q1, q3
; CHECK-NEXT:    vcvtt.f16.f32 q0, q2
; CHECK-NEXT:    vstrw.32 q1, [r0, #16]
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %strided.vec = shufflevector <8 x float> %val1, <8 x float> %val2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %out = fptrunc <16 x float> %strided.vec to <16 x half>
  store <16 x half> %out, <16 x half>* %src, align 4
  ret void
}