; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=LE %s
; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=BE %s
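; MVE VMOVNB narrows each element of its source vector and writes the results
; into the even (bottom) lanes of the destination, leaving the odd lanes
; unchanged; VMOVNT writes the odd (top) lanes instead. The LE run expects the
; bare instruction, while the BE run additionally expects the VREV64 lane
; reversals that big-endian lowering wraps around it.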

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %3 = trunc <16 x i16> %2 to <16 x i8>
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)
  %2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %3 = trunc <16 x i16> %2 to <16 x i8>
  ret <16 x i8> %3
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)
  %2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %3 = trunc <8 x i32> %2 to <8 x i16>
  ret <8 x i16> %3
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  %1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %2 = trunc <16 x i16> %1 to <16 x i8>
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  %1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  ret <8 x i16> %2
}

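; The *_m tests cover the predicated forms: the i16 mask argument is moved
; into p0 with VMSR, and the narrowing executes as VMOVNBT/VMOVNTT inside a
; VPST block, so lanes that are inactive in the predicate keep the value of
; the destination operand.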
define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_m_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 0, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_m_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_m_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 0, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_m_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovnbq_m_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovnbt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_m_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovnbt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_m_s16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_m_s32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, <4 x i1> %1)
  ret <8 x i16> %2
}

define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_m_u16(<16 x i8> %a, <8 x i16> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
  %2 = tail call <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8> %a, <8 x i16> %b, i32 1, <8 x i1> %1)
  ret <16 x i8> %2
}

define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_m_u32(<8 x i16> %a, <4 x i32> %b, i16 zeroext %p) {
; LE-LABEL: test_vmovntq_m_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmsr p0, r0
; LE-NEXT:    vpst
; LE-NEXT:    vmovntt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_m_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmsr p0, r0
; BE-NEXT:    vpst
; BE-NEXT:    vmovntt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %0 = zext i16 %p to i32
  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
  %2 = tail call <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16> %a, <4 x i32> %b, i32 1, <4 x i1> %1)
  ret <8 x i16> %2
}

declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <16 x i8> @llvm.arm.mve.vmovn.predicated.v16i8.v8i16.v8i1(<16 x i8>, <8 x i16>, i32, <8 x i1>)
declare <8 x i16> @llvm.arm.mve.vmovn.predicated.v8i16.v4i32.v4i1(<8 x i16>, <4 x i32>, i32, <4 x i1>)