1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; vadd: in-place loop over the i32 data at %s1, four lanes per iteration.
; Each iteration builds a lane predicate with vctp32 from the remaining count,
; masked-loads a <4 x i32> vector, applies the predicated MVE add against a
; splat of %c0, and masked-stores the result through the same pointer; the
; pointer then advances by 4 elements and the count drops by 4.
; The FileCheck assertions expect a dlstp.32/letp loop whose vadd.i32 takes
; the scalar directly from r1 instead of a splatted vector register.
4define void @vadd(i32* %s1, i32 %c0, i32 %N) {
5; CHECK-LABEL: vadd:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    cmp r2, #1
10; CHECK-NEXT:    it lt
11; CHECK-NEXT:    poplt {r7, pc}
12; CHECK-NEXT:  .LBB0_1: @ %while.body.lr.ph
13; CHECK-NEXT:    dlstp.32 lr, r2
14; CHECK-NEXT:  .LBB0_2: @ %while.body
15; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
16; CHECK-NEXT:    vldrw.u32 q0, [r0]
17; CHECK-NEXT:    vadd.i32 q0, q0, r1
18; CHECK-NEXT:    vstrw.32 q0, [r0], #16
19; CHECK-NEXT:    letp lr, .LBB0_2
20; CHECK-NEXT:  @ %bb.3: @ %while.end
21; CHECK-NEXT:    pop {r7, pc}
22entry:
23  %cmp11 = icmp sgt i32 %N, 0
24  br i1 %cmp11, label %while.body.lr.ph, label %while.end
25
26while.body.lr.ph:                                 ; preds = %entry
27  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
28  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
29  br label %while.body
30
31while.body:                                       ; preds = %while.body.lr.ph, %while.body
32  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
33  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
34  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
35  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
36  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
37  %3 = tail call <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
38  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
39  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
40  %sub = add nsw i32 %N.addr.012, -4
41  %cmp = icmp sgt i32 %N.addr.012, 4
42  br i1 %cmp, label %while.body, label %while.end
43
44while.end:                                        ; preds = %while.body, %entry
45  ret void
46}
47
; vsub: same loop shape as a vctp32-predicated masked load / operate / masked
; store over <4 x i32> at %s1, but the operation is the predicated MVE sub
; intrinsic with a splat of %c0 as the second operand.
; The FileCheck assertions expect vsub.i32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
48define void @vsub(i32* %s1, i32 %c0, i32 %N) {
49; CHECK-LABEL: vsub:
50; CHECK:       @ %bb.0: @ %entry
51; CHECK-NEXT:    .save {r7, lr}
52; CHECK-NEXT:    push {r7, lr}
53; CHECK-NEXT:    cmp r2, #1
54; CHECK-NEXT:    it lt
55; CHECK-NEXT:    poplt {r7, pc}
56; CHECK-NEXT:  .LBB1_1: @ %while.body.lr.ph
57; CHECK-NEXT:    dlstp.32 lr, r2
58; CHECK-NEXT:  .LBB1_2: @ %while.body
59; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
60; CHECK-NEXT:    vldrw.u32 q0, [r0]
61; CHECK-NEXT:    vsub.i32 q0, q0, r1
62; CHECK-NEXT:    vstrw.32 q0, [r0], #16
63; CHECK-NEXT:    letp lr, .LBB1_2
64; CHECK-NEXT:  @ %bb.3: @ %while.end
65; CHECK-NEXT:    pop {r7, pc}
66entry:
67  %cmp11 = icmp sgt i32 %N, 0
68  br i1 %cmp11, label %while.body.lr.ph, label %while.end
69
70while.body.lr.ph:                                 ; preds = %entry
71  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
72  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
73  br label %while.body
74
75while.body:                                       ; preds = %while.body.lr.ph, %while.body
76  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
77  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
78  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
79  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
80  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
81  %3 = tail call <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
82  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
83  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
84  %sub = add nsw i32 %N.addr.012, -4
85  %cmp = icmp sgt i32 %N.addr.012, 4
86  br i1 %cmp, label %while.body, label %while.end
87
88while.end:                                        ; preds = %while.body, %entry
89  ret void
90}
91
; vmul: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls the
; predicated MVE mul intrinsic with a splat of %c0, and masked-stores the
; result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vmul.i32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
92define void @vmul(i32* %s1, i32 %c0, i32 %N) {
93; CHECK-LABEL: vmul:
94; CHECK:       @ %bb.0: @ %entry
95; CHECK-NEXT:    .save {r7, lr}
96; CHECK-NEXT:    push {r7, lr}
97; CHECK-NEXT:    cmp r2, #1
98; CHECK-NEXT:    it lt
99; CHECK-NEXT:    poplt {r7, pc}
100; CHECK-NEXT:  .LBB2_1: @ %while.body.lr.ph
101; CHECK-NEXT:    dlstp.32 lr, r2
102; CHECK-NEXT:  .LBB2_2: @ %while.body
103; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
104; CHECK-NEXT:    vldrw.u32 q0, [r0]
105; CHECK-NEXT:    vmul.i32 q0, q0, r1
106; CHECK-NEXT:    vstrw.32 q0, [r0], #16
107; CHECK-NEXT:    letp lr, .LBB2_2
108; CHECK-NEXT:  @ %bb.3: @ %while.end
109; CHECK-NEXT:    pop {r7, pc}
110entry:
111  %cmp11 = icmp sgt i32 %N, 0
112  br i1 %cmp11, label %while.body.lr.ph, label %while.end
113
114while.body.lr.ph:                                 ; preds = %entry
115  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
116  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
117  br label %while.body
118
119while.body:                                       ; preds = %while.body.lr.ph, %while.body
120  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
121  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
122  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
123  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
124  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
125  %3 = tail call <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
126  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
127  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
128  %sub = add nsw i32 %N.addr.012, -4
129  %cmp = icmp sgt i32 %N.addr.012, 4
130  br i1 %cmp, label %while.body, label %while.end
131
132while.end:                                        ; preds = %while.body, %entry
133  ret void
134}
135
; vqadd: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE qadd intrinsic with a splat of %c0, and masked-stores the
; result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vqadd.s32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
; NOTE(review): the i32 0 operand presumably selects the signed form, matching
; the expected .s32 suffix - confirm against the intrinsic definition.
136define void @vqadd(i32* %s1, i32 %c0, i32 %N) {
137; CHECK-LABEL: vqadd:
138; CHECK:       @ %bb.0: @ %entry
139; CHECK-NEXT:    .save {r7, lr}
140; CHECK-NEXT:    push {r7, lr}
141; CHECK-NEXT:    cmp r2, #1
142; CHECK-NEXT:    it lt
143; CHECK-NEXT:    poplt {r7, pc}
144; CHECK-NEXT:  .LBB3_1: @ %while.body.lr.ph
145; CHECK-NEXT:    dlstp.32 lr, r2
146; CHECK-NEXT:  .LBB3_2: @ %while.body
147; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
148; CHECK-NEXT:    vldrw.u32 q0, [r0]
149; CHECK-NEXT:    vqadd.s32 q0, q0, r1
150; CHECK-NEXT:    vstrw.32 q0, [r0], #16
151; CHECK-NEXT:    letp lr, .LBB3_2
152; CHECK-NEXT:  @ %bb.3: @ %while.end
153; CHECK-NEXT:    pop {r7, pc}
154entry:
155  %cmp11 = icmp sgt i32 %N, 0
156  br i1 %cmp11, label %while.body.lr.ph, label %while.end
157
158while.body.lr.ph:                                 ; preds = %entry
159  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
160  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
161  br label %while.body
162
163while.body:                                       ; preds = %while.body.lr.ph, %while.body
164  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
165  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
166  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
167  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
168  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
169  %3 = tail call <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
170  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
171  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
172  %sub = add nsw i32 %N.addr.012, -4
173  %cmp = icmp sgt i32 %N.addr.012, 4
174  br i1 %cmp, label %while.body, label %while.end
175
176while.end:                                        ; preds = %while.body, %entry
177  ret void
178}
179
; vqsub: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE qsub intrinsic with a splat of %c0, and masked-stores the
; result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vqsub.s32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
; NOTE(review): the i32 0 operand presumably selects the signed form, matching
; the expected .s32 suffix - confirm against the intrinsic definition.
180define void @vqsub(i32* %s1, i32 %c0, i32 %N) {
181; CHECK-LABEL: vqsub:
182; CHECK:       @ %bb.0: @ %entry
183; CHECK-NEXT:    .save {r7, lr}
184; CHECK-NEXT:    push {r7, lr}
185; CHECK-NEXT:    cmp r2, #1
186; CHECK-NEXT:    it lt
187; CHECK-NEXT:    poplt {r7, pc}
188; CHECK-NEXT:  .LBB4_1: @ %while.body.lr.ph
189; CHECK-NEXT:    dlstp.32 lr, r2
190; CHECK-NEXT:  .LBB4_2: @ %while.body
191; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
192; CHECK-NEXT:    vldrw.u32 q0, [r0]
193; CHECK-NEXT:    vqsub.s32 q0, q0, r1
194; CHECK-NEXT:    vstrw.32 q0, [r0], #16
195; CHECK-NEXT:    letp lr, .LBB4_2
196; CHECK-NEXT:  @ %bb.3: @ %while.end
197; CHECK-NEXT:    pop {r7, pc}
198entry:
199  %cmp11 = icmp sgt i32 %N, 0
200  br i1 %cmp11, label %while.body.lr.ph, label %while.end
201
202while.body.lr.ph:                                 ; preds = %entry
203  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
204  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
205  br label %while.body
206
207while.body:                                       ; preds = %while.body.lr.ph, %while.body
208  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
209  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
210  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
211  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
212  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
213  %3 = tail call <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
214  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
215  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
216  %sub = add nsw i32 %N.addr.012, -4
217  %cmp = icmp sgt i32 %N.addr.012, 4
218  br i1 %cmp, label %while.body, label %while.end
219
220while.end:                                        ; preds = %while.body, %entry
221  ret void
222}
223
; vhadd: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE hadd intrinsic with a splat of %c0, and masked-stores the
; result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vhadd.s32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
; NOTE(review): the i32 0 operand presumably selects the signed form, matching
; the expected .s32 suffix - confirm against the intrinsic definition.
224define void @vhadd(i32* %s1, i32 %c0, i32 %N) {
225; CHECK-LABEL: vhadd:
226; CHECK:       @ %bb.0: @ %entry
227; CHECK-NEXT:    .save {r7, lr}
228; CHECK-NEXT:    push {r7, lr}
229; CHECK-NEXT:    cmp r2, #1
230; CHECK-NEXT:    it lt
231; CHECK-NEXT:    poplt {r7, pc}
232; CHECK-NEXT:  .LBB5_1: @ %while.body.lr.ph
233; CHECK-NEXT:    dlstp.32 lr, r2
234; CHECK-NEXT:  .LBB5_2: @ %while.body
235; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
236; CHECK-NEXT:    vldrw.u32 q0, [r0]
237; CHECK-NEXT:    vhadd.s32 q0, q0, r1
238; CHECK-NEXT:    vstrw.32 q0, [r0], #16
239; CHECK-NEXT:    letp lr, .LBB5_2
240; CHECK-NEXT:  @ %bb.3: @ %while.end
241; CHECK-NEXT:    pop {r7, pc}
242entry:
243  %cmp11 = icmp sgt i32 %N, 0
244  br i1 %cmp11, label %while.body.lr.ph, label %while.end
245
246while.body.lr.ph:                                 ; preds = %entry
247  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
248  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
249  br label %while.body
250
251while.body:                                       ; preds = %while.body.lr.ph, %while.body
252  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
253  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
254  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
255  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
256  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
257  %3 = tail call <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
258  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
259  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
260  %sub = add nsw i32 %N.addr.012, -4
261  %cmp = icmp sgt i32 %N.addr.012, 4
262  br i1 %cmp, label %while.body, label %while.end
263
264while.end:                                        ; preds = %while.body, %entry
265  ret void
266}
267
; vhsub: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE hsub intrinsic with a splat of %c0, and masked-stores the
; result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vhsub.s32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
; NOTE(review): the i32 0 operand presumably selects the signed form, matching
; the expected .s32 suffix - confirm against the intrinsic definition.
268define void @vhsub(i32* %s1, i32 %c0, i32 %N) {
269; CHECK-LABEL: vhsub:
270; CHECK:       @ %bb.0: @ %entry
271; CHECK-NEXT:    .save {r7, lr}
272; CHECK-NEXT:    push {r7, lr}
273; CHECK-NEXT:    cmp r2, #1
274; CHECK-NEXT:    it lt
275; CHECK-NEXT:    poplt {r7, pc}
276; CHECK-NEXT:  .LBB6_1: @ %while.body.lr.ph
277; CHECK-NEXT:    dlstp.32 lr, r2
278; CHECK-NEXT:  .LBB6_2: @ %while.body
279; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
280; CHECK-NEXT:    vldrw.u32 q0, [r0]
281; CHECK-NEXT:    vhsub.s32 q0, q0, r1
282; CHECK-NEXT:    vstrw.32 q0, [r0], #16
283; CHECK-NEXT:    letp lr, .LBB6_2
284; CHECK-NEXT:  @ %bb.3: @ %while.end
285; CHECK-NEXT:    pop {r7, pc}
286entry:
287  %cmp11 = icmp sgt i32 %N, 0
288  br i1 %cmp11, label %while.body.lr.ph, label %while.end
289
290while.body.lr.ph:                                 ; preds = %entry
291  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
292  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
293  br label %while.body
294
295while.body:                                       ; preds = %while.body.lr.ph, %while.body
296  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
297  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
298  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
299  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
300  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
301  %3 = tail call <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, i32 0, <4 x i1> %0, <4 x i32> %2)
302  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
303  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
304  %sub = add nsw i32 %N.addr.012, -4
305  %cmp = icmp sgt i32 %N.addr.012, 4
306  br i1 %cmp, label %while.body, label %while.end
307
308while.end:                                        ; preds = %while.body, %entry
309  ret void
310}
311
; vqdmull: the loop masked-loads four i16 values (alignment 2) from %s1,
; sign-extends them to <4 x i32>, reinterprets that as <8 x i16>, and passes it
; with an i16 splat of the truncated %c0 to the predicated vqdmull intrinsic.
; The <4 x i32> result is masked-stored back through %s1 as i32 lanes, and the
; pointer advances by 4 i32 elements per iteration.
; The FileCheck assertions expect a vldrh.s32 load and vqdmullb.s16 with the
; scalar in r1 inside a dlstp.32/letp loop.
; NOTE(review): the i32 0 operand presumably selects the bottom-half form
; (vqdmullb) - confirm against the intrinsic definition.
312define void @vqdmull(i32* %s1, i32 %c0, i32 %N) {
313; CHECK-LABEL: vqdmull:
314; CHECK:       @ %bb.0: @ %entry
315; CHECK-NEXT:    .save {r7, lr}
316; CHECK-NEXT:    push {r7, lr}
317; CHECK-NEXT:    cmp r2, #1
318; CHECK-NEXT:    it lt
319; CHECK-NEXT:    poplt {r7, pc}
320; CHECK-NEXT:  .LBB7_1: @ %while.body.lr.ph
321; CHECK-NEXT:    dlstp.32 lr, r2
322; CHECK-NEXT:  .LBB7_2: @ %while.body
323; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
324; CHECK-NEXT:    vldrh.s32 q0, [r0]
325; CHECK-NEXT:    vqdmullb.s16 q0, q0, r1
326; CHECK-NEXT:    vstrw.32 q0, [r0], #16
327; CHECK-NEXT:    letp lr, .LBB7_2
328; CHECK-NEXT:  @ %bb.3: @ %while.end
329; CHECK-NEXT:    pop {r7, pc}
330entry:
331  %cmp11 = icmp sgt i32 %N, 0
332  br i1 %cmp11, label %while.body.lr.ph, label %while.end
333
334while.body.lr.ph:                                 ; preds = %entry
335  %conv = trunc i32 %c0 to i16
336  %.splatinsert = insertelement <8 x i16> undef, i16 %conv, i32 0
337  %.splat = shufflevector <8 x i16> %.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
338  br label %while.body
339
340while.body:                                       ; preds = %while.body.lr.ph, %while.body
341  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
342  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
343  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
344  %1 = bitcast i32* %s1.addr.013 to <4 x i16>*
345  %2 = tail call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %1, i32 2, <4 x i1> %0, <4 x i16> zeroinitializer)
346  %3 = sext <4 x i16> %2 to <4 x i32>
347  %4 = bitcast <4 x i32> %3 to <8 x i16>
348  %5 = tail call <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16> %4, <8 x i16> %.splat, i32 0, <4 x i1> %0, <4 x i32> %3)
349  %6 = bitcast i32* %s1.addr.013 to <4 x i32>*
350  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %5, <4 x i32>* %6, i32 4, <4 x i1> %0)
351  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
352  %sub = add nsw i32 %N.addr.012, -4
353  %cmp = icmp sgt i32 %N.addr.012, 4
354  br i1 %cmp, label %while.body, label %while.end
355
356while.end:                                        ; preds = %while.body, %entry
357  ret void
358}
359
; vqdmulh: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE qdmulh intrinsic with a splat of %c0, and masked-stores
; the result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vqdmulh.s32 q0, q0, r1 (scalar taken from
; r1) inside a dlstp.32/letp loop.
360define void @vqdmulh(i32* %s1, i32 %c0, i32 %N) {
361; CHECK-LABEL: vqdmulh:
362; CHECK:       @ %bb.0: @ %entry
363; CHECK-NEXT:    .save {r7, lr}
364; CHECK-NEXT:    push {r7, lr}
365; CHECK-NEXT:    cmp r2, #1
366; CHECK-NEXT:    it lt
367; CHECK-NEXT:    poplt {r7, pc}
368; CHECK-NEXT:  .LBB8_1: @ %while.body.lr.ph
369; CHECK-NEXT:    dlstp.32 lr, r2
370; CHECK-NEXT:  .LBB8_2: @ %while.body
371; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
372; CHECK-NEXT:    vldrw.u32 q0, [r0]
373; CHECK-NEXT:    vqdmulh.s32 q0, q0, r1
374; CHECK-NEXT:    vstrw.32 q0, [r0], #16
375; CHECK-NEXT:    letp lr, .LBB8_2
376; CHECK-NEXT:  @ %bb.3: @ %while.end
377; CHECK-NEXT:    pop {r7, pc}
378entry:
379  %cmp11 = icmp sgt i32 %N, 0
380  br i1 %cmp11, label %while.body.lr.ph, label %while.end
381
382while.body.lr.ph:                                 ; preds = %entry
383  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
384  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
385  br label %while.body
386
387while.body:                                       ; preds = %while.body.lr.ph, %while.body
388  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
389  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
390  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
391  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
392  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
393  %3 = tail call <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
394  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
395  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
396  %sub = add nsw i32 %N.addr.012, -4
397  %cmp = icmp sgt i32 %N.addr.012, 4
398  br i1 %cmp, label %while.body, label %while.end
399
400while.end:                                        ; preds = %while.body, %entry
401  ret void
402}
403
; vqrdmulh: vctp32-predicated loop that masked-loads <4 x i32> from %s1, calls
; the predicated MVE qrdmulh intrinsic with a splat of %c0, and masked-stores
; the result back through the same pointer, stepping 4 elements per iteration.
; The FileCheck assertions expect vqrdmulh.s32 q0, q0, r1 (scalar taken from
; r1) inside a dlstp.32/letp loop.
404define void @vqrdmulh(i32* %s1, i32 %c0, i32 %N) {
405; CHECK-LABEL: vqrdmulh:
406; CHECK:       @ %bb.0: @ %entry
407; CHECK-NEXT:    .save {r7, lr}
408; CHECK-NEXT:    push {r7, lr}
409; CHECK-NEXT:    cmp r2, #1
410; CHECK-NEXT:    it lt
411; CHECK-NEXT:    poplt {r7, pc}
412; CHECK-NEXT:  .LBB9_1: @ %while.body.lr.ph
413; CHECK-NEXT:    dlstp.32 lr, r2
414; CHECK-NEXT:  .LBB9_2: @ %while.body
415; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
416; CHECK-NEXT:    vldrw.u32 q0, [r0]
417; CHECK-NEXT:    vqrdmulh.s32 q0, q0, r1
418; CHECK-NEXT:    vstrw.32 q0, [r0], #16
419; CHECK-NEXT:    letp lr, .LBB9_2
420; CHECK-NEXT:  @ %bb.3: @ %while.end
421; CHECK-NEXT:    pop {r7, pc}
422entry:
423  %cmp11 = icmp sgt i32 %N, 0
424  br i1 %cmp11, label %while.body.lr.ph, label %while.end
425
426while.body.lr.ph:                                 ; preds = %entry
427  %.splatinsert = insertelement <4 x i32> undef, i32 %c0, i32 0
428  %.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
429  br label %while.body
430
431while.body:                                       ; preds = %while.body.lr.ph, %while.body
432  %s1.addr.013 = phi i32* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
433  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
434  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
435  %1 = bitcast i32* %s1.addr.013 to <4 x i32>*
436  %2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %0, <4 x i32> zeroinitializer)
437  %3 = tail call <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32> %2, <4 x i32> %.splat, <4 x i1> %0, <4 x i32> %2)
438  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %1, i32 4, <4 x i1> %0)
439  %add.ptr = getelementptr inbounds i32, i32* %s1.addr.013, i32 4
440  %sub = add nsw i32 %N.addr.012, -4
441  %cmp = icmp sgt i32 %N.addr.012, 4
442  br i1 %cmp, label %while.body, label %while.end
443
444while.end:                                        ; preds = %while.body, %entry
445  ret void
446}
447
; vaddf: floating-point variant of the loop. Each iteration builds a vctp32
; predicate from the remaining count, masked-loads <4 x float> from %s1, calls
; the predicated MVE add intrinsic (fast-math flagged) with a splat of the
; float %c0, and masked-stores the result back through the same pointer.
; The FileCheck assertions expect vadd.f32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
448define void @vaddf(float* %s1, float %c0, i32 %N) {
449; CHECK-LABEL: vaddf:
450; CHECK:       @ %bb.0: @ %entry
451; CHECK-NEXT:    .save {r7, lr}
452; CHECK-NEXT:    push {r7, lr}
453; CHECK-NEXT:    cmp r2, #1
454; CHECK-NEXT:    it lt
455; CHECK-NEXT:    poplt {r7, pc}
456; CHECK-NEXT:  .LBB10_1: @ %while.body.lr.ph
457; CHECK-NEXT:    dlstp.32 lr, r2
458; CHECK-NEXT:  .LBB10_2: @ %while.body
459; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
460; CHECK-NEXT:    vldrw.u32 q0, [r0]
461; CHECK-NEXT:    vadd.f32 q0, q0, r1
462; CHECK-NEXT:    vstrw.32 q0, [r0], #16
463; CHECK-NEXT:    letp lr, .LBB10_2
464; CHECK-NEXT:  @ %bb.3: @ %while.end
465; CHECK-NEXT:    pop {r7, pc}
466entry:
467  %cmp11 = icmp sgt i32 %N, 0
468  br i1 %cmp11, label %while.body.lr.ph, label %while.end
469
470while.body.lr.ph:                                 ; preds = %entry
471  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
472  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
473  br label %while.body
474
475while.body:                                       ; preds = %while.body.lr.ph, %while.body
476  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
477  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
478  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
479  %1 = bitcast float* %s1.addr.013 to <4 x float>*
480  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
481  %3 = tail call fast <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
482  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
483  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
484  %sub = add nsw i32 %N.addr.012, -4
485  %cmp = icmp sgt i32 %N.addr.012, 4
486  br i1 %cmp, label %while.body, label %while.end
487
488while.end:                                        ; preds = %while.body, %entry
489  ret void
490}
491
; vsubf: vctp32-predicated loop that masked-loads <4 x float> from %s1, calls
; the predicated MVE sub intrinsic (fast-math flagged) with a splat of the
; float %c0, and masked-stores the result back through the same pointer,
; stepping 4 elements per iteration.
; The FileCheck assertions expect vsub.f32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
492define void @vsubf(float* %s1, float %c0, i32 %N) {
493; CHECK-LABEL: vsubf:
494; CHECK:       @ %bb.0: @ %entry
495; CHECK-NEXT:    .save {r7, lr}
496; CHECK-NEXT:    push {r7, lr}
497; CHECK-NEXT:    cmp r2, #1
498; CHECK-NEXT:    it lt
499; CHECK-NEXT:    poplt {r7, pc}
500; CHECK-NEXT:  .LBB11_1: @ %while.body.lr.ph
501; CHECK-NEXT:    dlstp.32 lr, r2
502; CHECK-NEXT:  .LBB11_2: @ %while.body
503; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
504; CHECK-NEXT:    vldrw.u32 q0, [r0]
505; CHECK-NEXT:    vsub.f32 q0, q0, r1
506; CHECK-NEXT:    vstrw.32 q0, [r0], #16
507; CHECK-NEXT:    letp lr, .LBB11_2
508; CHECK-NEXT:  @ %bb.3: @ %while.end
509; CHECK-NEXT:    pop {r7, pc}
510entry:
511  %cmp11 = icmp sgt i32 %N, 0
512  br i1 %cmp11, label %while.body.lr.ph, label %while.end
513
514while.body.lr.ph:                                 ; preds = %entry
515  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
516  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
517  br label %while.body
518
519while.body:                                       ; preds = %while.body.lr.ph, %while.body
520  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
521  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
522  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
523  %1 = bitcast float* %s1.addr.013 to <4 x float>*
524  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
525  %3 = tail call fast <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
526  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
527  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
528  %sub = add nsw i32 %N.addr.012, -4
529  %cmp = icmp sgt i32 %N.addr.012, 4
530  br i1 %cmp, label %while.body, label %while.end
531
532while.end:                                        ; preds = %while.body, %entry
533  ret void
534}
535
; vmulf: vctp32-predicated loop that masked-loads <4 x float> from %s1, calls
; the predicated MVE mul intrinsic (fast-math flagged) with a splat of the
; float %c0, and masked-stores the result back through the same pointer,
; stepping 4 elements per iteration.
; The FileCheck assertions expect vmul.f32 q0, q0, r1 (scalar taken from r1)
; inside a dlstp.32/letp loop.
536define void @vmulf(float* %s1, float %c0, i32 %N) {
537; CHECK-LABEL: vmulf:
538; CHECK:       @ %bb.0: @ %entry
539; CHECK-NEXT:    .save {r7, lr}
540; CHECK-NEXT:    push {r7, lr}
541; CHECK-NEXT:    cmp r2, #1
542; CHECK-NEXT:    it lt
543; CHECK-NEXT:    poplt {r7, pc}
544; CHECK-NEXT:  .LBB12_1: @ %while.body.lr.ph
545; CHECK-NEXT:    dlstp.32 lr, r2
546; CHECK-NEXT:  .LBB12_2: @ %while.body
547; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
548; CHECK-NEXT:    vldrw.u32 q0, [r0]
549; CHECK-NEXT:    vmul.f32 q0, q0, r1
550; CHECK-NEXT:    vstrw.32 q0, [r0], #16
551; CHECK-NEXT:    letp lr, .LBB12_2
552; CHECK-NEXT:  @ %bb.3: @ %while.end
553; CHECK-NEXT:    pop {r7, pc}
554entry:
555  %cmp11 = icmp sgt i32 %N, 0
556  br i1 %cmp11, label %while.body.lr.ph, label %while.end
557
558while.body.lr.ph:                                 ; preds = %entry
559  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
560  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
561  br label %while.body
562
563while.body:                                       ; preds = %while.body.lr.ph, %while.body
564  %s1.addr.013 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
565  %N.addr.012 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
566  %0 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.012)
567  %1 = bitcast float* %s1.addr.013 to <4 x float>*
568  %2 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %1, i32 4, <4 x i1> %0, <4 x float> zeroinitializer)
569  %3 = tail call fast <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float> %2, <4 x float> %.splat, <4 x i1> %0, <4 x float> %2)
570  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %1, i32 4, <4 x i1> %0)
571  %add.ptr = getelementptr inbounds float, float* %s1.addr.013, i32 4
572  %sub = add nsw i32 %N.addr.012, -4
573  %cmp = icmp sgt i32 %N.addr.012, 4
574  br i1 %cmp, label %while.body, label %while.end
575
576while.end:                                        ; preds = %while.body, %entry
577  ret void
578}
579
; vfma: per iteration, builds a vctp32 predicate from the remaining count,
; masked-loads <4 x float> from the advancing %s1 pointer and from the %s2
; pointer (cast once in the preheader and never advanced), then calls the
; predicated fma intrinsic with operands (%s2 data, splat of %c0, %s1 data)
; and masked-stores the result to %s1.
; The FileCheck assertions expect vfma.f32 q1, q0, r2, with the splat operand
; taken from the scalar register r2, inside a dlstp.32/letp loop.
580define void @vfma(float* %s1, float* %s2, float %c0, i32 %N) {
581; CHECK-LABEL: vfma:
582; CHECK:       @ %bb.0: @ %entry
583; CHECK-NEXT:    .save {r7, lr}
584; CHECK-NEXT:    push {r7, lr}
585; CHECK-NEXT:    cmp r3, #1
586; CHECK-NEXT:    it lt
587; CHECK-NEXT:    poplt {r7, pc}
588; CHECK-NEXT:  .LBB13_1: @ %while.body.lr.ph
589; CHECK-NEXT:    dlstp.32 lr, r3
590; CHECK-NEXT:  .LBB13_2: @ %while.body
591; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
592; CHECK-NEXT:    vldrw.u32 q0, [r1]
593; CHECK-NEXT:    vldrw.u32 q1, [r0]
594; CHECK-NEXT:    vfma.f32 q1, q0, r2
595; CHECK-NEXT:    vstrw.32 q1, [r0], #16
596; CHECK-NEXT:    letp lr, .LBB13_2
597; CHECK-NEXT:  @ %bb.3: @ %while.end
598; CHECK-NEXT:    pop {r7, pc}
599entry:
600  %cmp12 = icmp sgt i32 %N, 0
601  br i1 %cmp12, label %while.body.lr.ph, label %while.end
602
603while.body.lr.ph:                                 ; preds = %entry
604  %0 = bitcast float* %s2 to <4 x float>*
605  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
606  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
607  br label %while.body
608
609while.body:                                       ; preds = %while.body.lr.ph, %while.body
610  %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
611  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
612  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
613  %2 = bitcast float* %s1.addr.014 to <4 x float>*
614  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
615  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
616  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %4, <4 x float> %.splat, <4 x float> %3, <4 x i1> %1)
617  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
618  %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
619  %sub = add nsw i32 %N.addr.013, -4
620  %cmp = icmp sgt i32 %N.addr.013, 4
621  br i1 %cmp, label %while.body, label %while.end
622
623while.end:                                        ; preds = %while.body, %entry
624  ret void
625}
626
; vfmas: per iteration, builds a vctp32 predicate from the remaining count,
; masked-loads <4 x float> from the advancing %s1 pointer and from the %s2
; pointer (cast once in the preheader and never advanced), then calls the
; predicated fma intrinsic with operands (%s1 data, %s2 data, splat of %c0),
; so the splat sits in the third operand position, and masked-stores the
; result to %s1.
; The FileCheck assertions expect vfmas.f32 q1, q0, r2, with the splat operand
; taken from the scalar register r2, inside a dlstp.32/letp loop.
627define void @vfmas(float* %s1, float* %s2, float %c0, i32 %N) {
628; CHECK-LABEL: vfmas:
629; CHECK:       @ %bb.0: @ %entry
630; CHECK-NEXT:    .save {r7, lr}
631; CHECK-NEXT:    push {r7, lr}
632; CHECK-NEXT:    cmp r3, #1
633; CHECK-NEXT:    it lt
634; CHECK-NEXT:    poplt {r7, pc}
635; CHECK-NEXT:  .LBB14_1: @ %while.body.lr.ph
636; CHECK-NEXT:    dlstp.32 lr, r3
637; CHECK-NEXT:  .LBB14_2: @ %while.body
638; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
639; CHECK-NEXT:    vldrw.u32 q0, [r1]
640; CHECK-NEXT:    vldrw.u32 q1, [r0]
641; CHECK-NEXT:    vfmas.f32 q1, q0, r2
642; CHECK-NEXT:    vstrw.32 q1, [r0], #16
643; CHECK-NEXT:    letp lr, .LBB14_2
644; CHECK-NEXT:  @ %bb.3: @ %while.end
645; CHECK-NEXT:    pop {r7, pc}
646entry:
647  %cmp12 = icmp sgt i32 %N, 0
648  br i1 %cmp12, label %while.body.lr.ph, label %while.end
649
650while.body.lr.ph:                                 ; preds = %entry
651  %0 = bitcast float* %s2 to <4 x float>*
652  %.splatinsert = insertelement <4 x float> undef, float %c0, i32 0
653  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
654  br label %while.body
655
656while.body:                                       ; preds = %while.body.lr.ph, %while.body
657  %s1.addr.014 = phi float* [ %s1, %while.body.lr.ph ], [ %add.ptr, %while.body ]
658  %N.addr.013 = phi i32 [ %N, %while.body.lr.ph ], [ %sub, %while.body ]
659  %1 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %N.addr.013)
660  %2 = bitcast float* %s1.addr.014 to <4 x float>*
661  %3 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %2, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
662  %4 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %1, <4 x float> zeroinitializer)
663  %5 = tail call fast <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float> %3, <4 x float> %4, <4 x float> %.splat, <4 x i1> %1)
664  tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %5, <4 x float>* %2, i32 4, <4 x i1> %1)
665  %add.ptr = getelementptr inbounds float, float* %s1.addr.014, i32 4
666  %sub = add nsw i32 %N.addr.013, -4
667  %cmp = icmp sgt i32 %N.addr.013, 4
668  br i1 %cmp, label %while.body, label %while.end
669
670while.end:                                        ; preds = %while.body, %entry
671  ret void
672}
673
; Declarations of the intrinsics called by the test functions in this file:
; the vctp32 predicate generator, the generic masked load/store intrinsics for
; <4 x i32>, <4 x i16> and <4 x float>, and the predicated MVE arithmetic
; intrinsics. The qadd/qsub/hadd/hsub/vqdmull forms take an extra i32 operand
; ahead of the predicate.
674declare <4 x i1> @llvm.arm.mve.vctp32(i32)
675declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
676declare <4 x i32> @llvm.arm.mve.add.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
677declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
678declare <4 x i32> @llvm.arm.mve.sub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
679declare <4 x i32> @llvm.arm.mve.mul.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
680declare <4 x i32> @llvm.arm.mve.qadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
681declare <4 x i32> @llvm.arm.mve.qsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
682declare <4 x i32> @llvm.arm.mve.hadd.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
683declare <4 x i32> @llvm.arm.mve.hsub.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>, <4 x i32>)
684declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>)
685declare <4 x i32> @llvm.arm.mve.vqdmull.predicated.v4i32.v8i16.v4i1(<8 x i16>, <8 x i16>, i32, <4 x i1>, <4 x i32>)
686declare <4 x i32> @llvm.arm.mve.qdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
687declare <4 x i32> @llvm.arm.mve.qrdmulh.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, <4 x i1>, <4 x i32>)
688declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
689declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
690declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)
691declare <4 x float> @llvm.arm.mve.sub.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
692declare <4 x float> @llvm.arm.mve.mul.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
693declare <4 x float> @llvm.arm.mve.fma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x float>, <4 x i1>)
694