; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Signed high-half multiply on <2 x i32>: sign-extend both operands to i64,
; multiply, arithmetic-shift right by 32 and truncate back to i32.
; The expected assembly uses a widening vmullb.s32 followed by scalar lane
; moves and shifts rather than a single vmulh for this two-element type.
define arm_aapcs_vfpcc <2 x i32> @vmulhs_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhs_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s32 q2, q0, q1
; CHECK-NEXT:    vmov r0, s11
; CHECK-NEXT:    vmov r1, s9
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <2 x i32> %s0 to <2 x i64>
  %s1s = sext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = ashr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}
; Unsigned high-half multiply on <2 x i32>: zero-extend both operands to i64,
; multiply, logical-shift right by 32 and truncate back to i32.
; The expected assembly uses a widening vmullb.u32, then moves the high words
; into place and fills the remaining lanes from a zero constant-pool entry.
define arm_aapcs_vfpcc <2 x i32> @vmulhu_v2i32(<2 x i32> %s0, <2 x i32> %s1) {
; CHECK-LABEL: vmulhu_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u32 q2, q0, q1
; CHECK-NEXT:    vldr s1, .LCPI1_0
; CHECK-NEXT:    vmov.f32 s0, s9
; CHECK-NEXT:    vmov.f32 s2, s11
; CHECK-NEXT:    vmov.f32 s3, s1
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 2
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 0x00000000 @ float 0
entry:
  %s0s = zext <2 x i32> %s0 to <2 x i64>
  %s1s = zext <2 x i32> %s1 to <2 x i64>
  %m = mul <2 x i64> %s0s, %s1s
  %s = lshr <2 x i64> %m, <i64 32, i64 32>
  %s2 = trunc <2 x i64> %s to <2 x i32>
  ret <2 x i32> %s2
}
; Signed high-half multiply on a full 128-bit <4 x i32> vector
; (sext -> mul -> ashr 32 -> trunc). The whole pattern is expected to
; select to a single vmulh.s32.
define arm_aapcs_vfpcc <4 x i32> @vmulhs_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhs_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i32> %s0 to <4 x i64>
  %s1s = sext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; Unsigned high-half multiply on a full 128-bit <4 x i32> vector
; (zext -> mul -> lshr 32 -> trunc). The whole pattern is expected to
; select to a single vmulh.u32.
define arm_aapcs_vfpcc <4 x i32> @vmulhu_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
; CHECK-LABEL: vmulhu_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u32 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i32> %s0 to <4 x i64>
  %s1s = zext <4 x i32> %s1 to <4 x i64>
  %m = mul <4 x i64> %s0s, %s1s
  %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
  %s2 = trunc <4 x i64> %s to <4 x i32>
  ret <4 x i32> %s2
}
; Signed high-half multiply on <4 x i16> (sext to i32 -> mul -> ashr 16 ->
; trunc). For this four-element i16 type the expected assembly is a widening
; vmullb.s16 followed by a vshr.s32 #16, not a vmulh.
define arm_aapcs_vfpcc <4 x i16> @vmulhs_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhs_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s16 q0, q0, q1
; CHECK-NEXT:    vshr.s32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <4 x i16> %s0 to <4 x i32>
  %s1s = sext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = ashr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; Unsigned high-half multiply on <4 x i16> (zext to i32 -> mul -> lshr 16 ->
; trunc). For this four-element i16 type the expected assembly is a widening
; vmullb.u16 followed by a vshr.u32 #16, not a vmulh.
define arm_aapcs_vfpcc <4 x i16> @vmulhu_v4i16(<4 x i16> %s0, <4 x i16> %s1) {
; CHECK-LABEL: vmulhu_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u16 q0, q0, q1
; CHECK-NEXT:    vshr.u32 q0, q0, #16
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <4 x i16> %s0 to <4 x i32>
  %s1s = zext <4 x i16> %s1 to <4 x i32>
  %m = mul <4 x i32> %s0s, %s1s
  %s = lshr <4 x i32> %m, <i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <4 x i32> %s to <4 x i16>
  ret <4 x i16> %s2
}
; Signed high-half multiply on a full 128-bit <8 x i16> vector
; (sext to i32 -> mul -> ashr 16 -> trunc). The whole pattern is expected to
; select to a single vmulh.s16.
define arm_aapcs_vfpcc <8 x i16> @vmulhs_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhs_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i16> %s0 to <8 x i32>
  %s1s = sext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; Unsigned high-half multiply on a full 128-bit <8 x i16> vector
; (zext to i32 -> mul -> lshr 16 -> trunc). The whole pattern is expected to
; select to a single vmulh.u16.
define arm_aapcs_vfpcc <8 x i16> @vmulhu_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
; CHECK-LABEL: vmulhu_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u16 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i16> %s0 to <8 x i32>
  %s1s = zext <8 x i16> %s1 to <8 x i32>
  %m = mul <8 x i32> %s0s, %s1s
  %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %s2 = trunc <8 x i32> %s to <8 x i16>
  ret <8 x i16> %s2
}
; Signed high-half multiply on <8 x i8> (sext to i16 -> mul -> ashr 8 ->
; trunc). For this eight-element i8 type the expected assembly is a widening
; vmullb.s8 followed by a vshr.s16 #8, not a vmulh.
define arm_aapcs_vfpcc <8 x i8> @vmulhs_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhs_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.s8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <8 x i8> %s0 to <8 x i16>
  %s1s = sext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = ashr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; Unsigned high-half multiply on <8 x i8> (zext to i16 -> mul -> lshr 8 ->
; trunc). For this eight-element i8 type the expected assembly is a widening
; vmullb.u8 followed by a vshr.u16 #8, not a vmulh.
define arm_aapcs_vfpcc <8 x i8> @vmulhu_v8i8(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: vmulhu_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.u16 q0, q0, #8
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %s = lshr <8 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <8 x i16> %s to <8 x i8>
  ret <8 x i8> %s2
}
; Signed high-half multiply on a full 128-bit <16 x i8> vector
; (sext to i16 -> mul -> ashr 8 -> trunc). The whole pattern is expected to
; select to a single vmulh.s8.
define arm_aapcs_vfpcc <16 x i8> @vmulhs_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhs_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.s8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = sext <16 x i8> %s0 to <16 x i16>
  %s1s = sext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = ashr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; Unsigned high-half multiply on a full 128-bit <16 x i8> vector
; (zext to i16 -> mul -> lshr 8 -> trunc). The whole pattern is expected to
; select to a single vmulh.u8.
define arm_aapcs_vfpcc <16 x i8> @vmulhu_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: vmulhu_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmulh.u8 q0, q0, q1
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %s = lshr <16 x i16> %m, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %s2 = trunc <16 x i16> %s to <16 x i8>
  ret <16 x i8> %s2
}
; Loop form of the signed i8 high-half multiply: z[i] = high byte of
; (sext x[i]) * (sext y[i]), processed 16 lanes at a time over a fixed 1024
; elements (64 iterations, matching the "mov.w lr, #64" in the expected
; assembly). The %n argument is not used by the body.
; The shift is a logical lshr even though the inputs are sign-extended; the
; following trunc keeps only bits 8..15 of the product, which are identical
; for lshr and ashr, and the expected assembly is still vmulh.s8.
define void @vmulh_s8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB12_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB12_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = sext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = sext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the signed i16 high-half multiply: z[i] = high half of
; (sext x[i]) * (sext y[i]), processed 8 lanes at a time over a fixed 1024
; elements (128 iterations, matching the "mov.w lr, #128" in the expected
; assembly). The %n argument is not used by the body.
; The shift is a logical lshr even though the inputs are sign-extended; the
; following trunc keeps only bits 16..31 of the product, which are identical
; for lshr and ashr, and the expected assembly is still vmulh.s16.
define void @vmulh_s16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB13_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB13_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = sext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the signed i32 high-half multiply: z[i] = high word of
; (sext x[i]) * (sext y[i]), processed 4 lanes at a time over a fixed 1024
; elements (256 iterations, matching the "mov.w lr, #256" in the expected
; assembly). The %n argument is not used by the body.
; The shift is a logical lshr even though the inputs are sign-extended; the
; following trunc keeps only bits 32..63 of the product, which are identical
; for lshr and ashr, and the expected assembly is still vmulh.s32.
define void @vmulh_s32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB14_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = sext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = sext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i8 high-half multiply: z[i] = high byte of
; (zext x[i]) * (zext y[i]), processed 16 lanes at a time over a fixed 1024
; elements (64 iterations, matching the "mov.w lr, #64" in the expected
; assembly). The %n argument is not used by the body.
; The loop body is expected to select to vmulh.u8.
define void @vmulh_u8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #64
; CHECK-NEXT:  .LBB15_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB15_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
  %2 = zext <16 x i8> %wide.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %4, align 1
  %5 = zext <16 x i8> %wide.load17 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %z, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  store <16 x i8> %8, <16 x i8>* %10, align 1
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i16 high-half multiply: z[i] = high half of
; (zext x[i]) * (zext y[i]), processed 8 lanes at a time over a fixed 1024
; elements (128 iterations, matching the "mov.w lr, #128" in the expected
; assembly). The %n argument is not used by the body.
; The loop body is expected to select to vmulh.u16.
define void @vmulh_u16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #128
; CHECK-NEXT:  .LBB16_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB16_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
  %2 = zext <8 x i16> %wide.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %4, align 2
  %5 = zext <8 x i16> %wide.load17 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %z, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  store <8 x i16> %8, <8 x i16>* %10, align 2
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Loop form of the unsigned i32 high-half multiply: z[i] = high word of
; (zext x[i]) * (zext y[i]), processed 4 lanes at a time over a fixed 1024
; elements (256 iterations, matching the "mov.w lr, #256" in the expected
; assembly). The %n argument is not used by the body.
; The loop body is expected to select to vmulh.u32.
define void @vmulh_u32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
; CHECK-LABEL: vmulh_u32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #256
; CHECK-NEXT:  .LBB17_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB17_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
  %2 = zext <4 x i32> %wide.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.load17 = load <4 x i32>, <4 x i32>* %4, align 4
  %5 = zext <4 x i32> %wide.load17 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %z, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  store <4 x i32> %8, <4 x i32>* %10, align 4
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, 1024
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
; Predicated loop form of the signed i32 high-half multiply:
; d[i] = high word of (sext x[i]) * (sext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 3) & -4 rounds the
; trip count up to a multiple of 4 and llvm.get.active.lane.mask guards the
; masked loads and the masked store.
; The expected assembly is a dlstp.32/letp loop around a vmulh.s32. The shift
; is a logical lshr; the trunc keeps only bits 32..63 of the product, which
; are identical for lshr and ashr.
define void @vmulh_s32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB18_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB18_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.s32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB18_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = sext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nsw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Predicated loop form of the unsigned i32 high-half multiply:
; d[i] = high word of (zext x[i]) * (zext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 3) & -4 rounds the
; trip count up to a multiple of 4 and llvm.get.active.lane.mask guards the
; masked loads and the masked store.
; The expected assembly is a dlstp.32/letp loop around a vmulh.u32.
define void @vmulh_u32_pred(i32* noalias nocapture %d, i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u32_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB19_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r3
; CHECK-NEXT:  .LBB19_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
; CHECK-NEXT:    vmulh.u32 q0, q1, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB19_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, i32* %x, i32 %index
  %1 = bitcast i32* %0 to <4 x i32>*
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = zext <4 x i32> %wide.masked.load to <4 x i64>
  %3 = getelementptr inbounds i32, i32* %y, i32 %index
  %4 = bitcast i32* %3 to <4 x i32>*
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %5 = zext <4 x i32> %wide.masked.load12 to <4 x i64>
  %6 = mul nuw <4 x i64> %5, %2
  %7 = lshr <4 x i64> %6, <i64 32, i64 32, i64 32, i64 32>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  %9 = getelementptr inbounds i32, i32* %d, i32 %index
  %10 = bitcast i32* %9 to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Predicated loop form of the signed i16 high-half multiply:
; d[i] = high half of (sext x[i]) * (sext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 7) & -8 rounds the
; trip count up to a multiple of 8 and llvm.get.active.lane.mask guards the
; masked loads and the masked store.
; The expected assembly is a dlstp.16/letp loop around a vmulh.s16. The shift
; is a logical lshr; the trunc keeps only bits 16..31 of the product, which
; are identical for lshr and ashr.
define void @vmulh_s16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB20_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB20_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.s16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB20_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = sext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Predicated loop form of the unsigned i16 high-half multiply:
; d[i] = high half of (zext x[i]) * (zext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 7) & -8 rounds the
; trip count up to a multiple of 8 and llvm.get.active.lane.mask guards the
; masked loads and the masked store.
; The expected assembly is a dlstp.16/letp loop around a vmulh.u16.
define void @vmulh_u16_pred(i16* noalias nocapture %d, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u16_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB21_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB21_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vldrh.u16 q1, [r2], #16
; CHECK-NEXT:    vmulh.u16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB21_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, i16* %x, i32 %index
  %1 = bitcast i16* %0 to <8 x i16>*
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = zext <8 x i16> %wide.masked.load to <8 x i32>
  %3 = getelementptr inbounds i16, i16* %y, i32 %index
  %4 = bitcast i16* %3 to <8 x i16>*
  %wide.masked.load12 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %5 = zext <8 x i16> %wide.masked.load12 to <8 x i32>
  %6 = mul nuw <8 x i32> %5, %2
  %7 = lshr <8 x i32> %6, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %8 = trunc <8 x i32> %7 to <8 x i16>
  %9 = getelementptr inbounds i16, i16* %d, i32 %index
  %10 = bitcast i16* %9 to <8 x i16>*
  call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %8, <8 x i16>* %10, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Predicated loop form of the signed i8 high-half multiply:
; d[i] = high byte of (sext x[i]) * (sext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 15) & -16 rounds
; the trip count up to a multiple of 16 and llvm.get.active.lane.mask guards
; the masked loads and the masked store.
; The expected assembly is a dlstp.8/letp loop around a vmulh.s8. The shift
; is a logical lshr; the trunc keeps only bits 8..15 of the product, which
; are identical for lshr and ashr.
define void @vmulh_s8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_s8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB22_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB22_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.s8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB22_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = sext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = sext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nsw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Predicated loop form of the unsigned i8 high-half multiply:
; d[i] = high byte of (zext x[i]) * (zext y[i]) for i in [0, %n).
; The loop is skipped when %n <= 0. Inside, %n.vec = (%n + 15) & -16 rounds
; the trip count up to a multiple of 16 and llvm.get.active.lane.mask guards
; the masked loads and the masked store.
; The expected assembly is a dlstp.8/letp loop around a vmulh.u8.
define void @vmulh_u8_pred(i8* noalias nocapture %d, i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: vmulh_u8_pred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB23_1: @ %vector.ph
; CHECK-NEXT:    dlstp.8 lr, r3
; CHECK-NEXT:  .LBB23_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r1], #16
; CHECK-NEXT:    vldrb.u8 q1, [r2], #16
; CHECK-NEXT:    vmulh.u8 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB23_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp10 = icmp sgt i32 %n, 0
  br i1 %cmp10, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 15
  %n.vec = and i32 %n.rnd.up, -16
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i8, i8* %x, i32 %index
  %1 = bitcast i8* %0 to <16 x i8>*
  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  %3 = getelementptr inbounds i8, i8* %y, i32 %index
  %4 = bitcast i8* %3 to <16 x i8>*
  %wide.masked.load12 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> poison)
  %5 = zext <16 x i8> %wide.masked.load12 to <16 x i16>
  %6 = mul nuw <16 x i16> %5, %2
  %7 = lshr <16 x i16> %6, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %8 = trunc <16 x i16> %7 to <16 x i8>
  %9 = getelementptr inbounds i8, i8* %d, i32 %index
  %10 = bitcast i8* %9 to <16 x i8>*
  call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %8, <16 x i8>* %10, i32 1, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %11 = icmp eq i32 %index.next, %n.vec
  br i1 %11, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsics used by the predicated (*_pred) loop tests: an active-lane-mask
; generator plus masked load/store, declared once per vector width
; (4 x i32, 8 x i16, 16 x i8).
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)