; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled -o - %s | FileCheck %s
3
; Tail-predicated loop: pDst[i] = round(pSrcA[i]) for i in [0, n), processed
; 4 floats at a time. llvm.round lowers to a single MVE vrinta.f32, so the
; backend can form a dlstp/letp low-overhead loop, as the CHECK lines show.
define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: round:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrinta.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.round.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
48
; Same tail-predicated loop shape as @round, but using llvm.rint, which
; lowers to a single MVE vrintx.f32 (round to nearest, raising inexact),
; so the dlstp/letp low-overhead loop still forms.
define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: rint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintx.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.rint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
93
; Same tail-predicated loop shape as @round, but using llvm.trunc, which
; lowers to a single MVE vrintz.f32 (round toward zero), so the
; dlstp/letp low-overhead loop still forms.
define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: trunc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintz.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.trunc.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
138
; Same tail-predicated loop shape as @round, but using llvm.ceil, which
; lowers to a single MVE vrintp.f32 (round toward +infinity), so the
; dlstp/letp low-overhead loop still forms.
define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: ceil:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintp.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
183
; Same tail-predicated loop shape as @round, but using llvm.floor, which
; lowers to a single MVE vrintm.f32 (round toward -infinity), so the
; dlstp/letp low-overhead loop still forms.
define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: floor:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintm.f32 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r1], #16
; CHECK-NEXT:    letp lr, .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.floor.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
228
; nearbyint is lowered into multiple per-lane vrintr.f32 instructions rather
; than a single vector rounding op, but the CHECK lines below show the loop is
; still tail predicated (dlstp/letp), since the masked load/store cover the
; inactive tail lanes.
; Same loop shape as the functions above, but using llvm.nearbyint. The CHECK
; lines show it is lowered to four scalar vrintr.f32 instructions (one per
; lane s0-s3 -> s4-s7) instead of one vector op, while the loop itself is
; still tail predicated via dlstp/letp thanks to the masked load/store.
define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 {
; CHECK-LABEL: nearbyint:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r7, pc}
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vrintr.f32 s7, s3
; CHECK-NEXT:    vrintr.f32 s6, s2
; CHECK-NEXT:    vrintr.f32 s5, s1
; CHECK-NEXT:    vrintr.f32 s4, s0
; CHECK-NEXT:    vstrw.32 q1, [r1], #16
; CHECK-NEXT:    letp lr, .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp5 = icmp eq i32 %n, 0
  br i1 %cmp5, label %for.cond.cleanup, label %vector.ph ; skip the loop when n == 0

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4 ; n rounded up to a multiple of 4 lanes
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr float, float* %pSrcA, i32 %index
  %next.gep14 = getelementptr float, float* %pDst, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) ; lane l active while index+l < n
  %0 = bitcast float* %next.gep to <4 x float>*
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %1 = call fast <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.masked.load)
  %2 = bitcast float* %next.gep14 to <4 x float>*
  call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
277
; Declarations of the intrinsics used by the loops above. Attribute groups
; #0-#4 are defined elsewhere in the file (outside this chunk).
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1

declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) #2

declare <4 x float> @llvm.trunc.v4f32(<4 x float>) #3

declare <4 x float> @llvm.rint.v4f32(<4 x float>) #3

declare <4 x float> @llvm.round.v4f32(<4 x float>) #3

declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #3

declare <4 x float> @llvm.floor.v4f32(<4 x float>) #3

; NOTE(review): nearbyint uses attribute group #1 while the other rounding
; intrinsics use #3 — the group bodies are not visible in this chunk, so
; verify against the full file whether that difference is intentional.
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) #1

declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) #4
295