1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3
; sext i8->i32, multiply, ashr #7 (i.e. (2*a*b)>>8), clamp with smin(127),
; then add-reduce. The shift+clamp of the widened product is the VQDMULH
; pattern, so this should select vqdmulh.s8 feeding vaddv.s8.
4define arm_aapcs_vfpcc i32 @vqdmulh_v16i8(<16 x i8> %s0, <16 x i8> %s1) {
5; CHECK-LABEL: vqdmulh_v16i8:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
8; CHECK-NEXT:    vaddv.s8 r0, q0
9; CHECK-NEXT:    bx lr
10entry:
11  %l2 = sext <16 x i8> %s0 to <16 x i32>
12  %l5 = sext <16 x i8> %s1 to <16 x i32>
13  %l6 = mul nsw <16 x i32> %l5, %l2
14  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
15  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
16  %l10 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %l9)
17  ret i32 %l10
18}
19
; Same i8 pattern but truncated back to <16 x i8>: should fold to a single
; vqdmulh.s8 with no widening/narrowing instructions.
20define arm_aapcs_vfpcc <16 x i8> @vqdmulh_v16i8_b(<16 x i8> %s0, <16 x i8> %s1) {
21; CHECK-LABEL: vqdmulh_v16i8_b:
22; CHECK:       @ %bb.0: @ %entry
23; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
24; CHECK-NEXT:    bx lr
25entry:
26  %l2 = sext <16 x i8> %s0 to <16 x i32>
27  %l5 = sext <16 x i8> %s1 to <16 x i32>
28  %l6 = mul nsw <16 x i32> %l5, %l2
29  %l7 = ashr <16 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
30  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
31  %l10 = trunc <16 x i32> %l9 to <16 x i8>
32  ret <16 x i8> %l10
33}
34
; Sub-128-bit (<8 x i8>) variant: still selects vqdmulh.s8, with vmovlb.s8
; re-sign-extending the i8 results into the 16-bit lanes of the return type.
35define arm_aapcs_vfpcc <8 x i8> @vqdmulh_v8i8_b(<8 x i8> %s0, <8 x i8> %s1) {
36; CHECK-LABEL: vqdmulh_v8i8_b:
37; CHECK:       @ %bb.0: @ %entry
38; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
39; CHECK-NEXT:    vmovlb.s8 q0, q0
40; CHECK-NEXT:    bx lr
41entry:
42  %l2 = sext <8 x i8> %s0 to <8 x i32>
43  %l5 = sext <8 x i8> %s1 to <8 x i32>
44  %l6 = mul nsw <8 x i32> %l5, %l2
45  %l7 = ashr <8 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
46  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
47  %l10 = trunc <8 x i32> %l9 to <8 x i8>
48  ret <8 x i8> %l10
49}
50
; <4 x i8> variant: vqdmulh.s8 plus two vmovlb steps (s8 then s16) to
; sign-extend each i8 result into its 32-bit lane.
51define arm_aapcs_vfpcc <4 x i8> @vqdmulh_v4i8_b(<4 x i8> %s0, <4 x i8> %s1) {
52; CHECK-LABEL: vqdmulh_v4i8_b:
53; CHECK:       @ %bb.0: @ %entry
54; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
55; CHECK-NEXT:    vmovlb.s8 q0, q0
56; CHECK-NEXT:    vmovlb.s16 q0, q0
57; CHECK-NEXT:    bx lr
58entry:
59  %l2 = sext <4 x i8> %s0 to <4 x i32>
60  %l5 = sext <4 x i8> %s1 to <4 x i32>
61  %l6 = mul nsw <4 x i32> %l5, %l2
62  %l7 = ashr <4 x i32> %l6, <i32 7, i32 7, i32 7, i32 7>
63  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
64  %l10 = trunc <4 x i32> %l9 to <4 x i8>
65  ret <4 x i8> %l10
66}
67
; Double-register (<32 x i8>) variant: the pattern should split cleanly into
; two independent vqdmulh.s8 instructions, one per q register.
68define arm_aapcs_vfpcc <32 x i8> @vqdmulh_v32i8_b(<32 x i8> %s0, <32 x i8> %s1) {
69; CHECK-LABEL: vqdmulh_v32i8_b:
70; CHECK:       @ %bb.0: @ %entry
71; CHECK-NEXT:    vqdmulh.s8 q0, q2, q0
72; CHECK-NEXT:    vqdmulh.s8 q1, q3, q1
73; CHECK-NEXT:    bx lr
74entry:
75  %l2 = sext <32 x i8> %s0 to <32 x i32>
76  %l5 = sext <32 x i8> %s1 to <32 x i32>
77  %l6 = mul nsw <32 x i32> %l5, %l2
78  %l7 = ashr <32 x i32> %l6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
79  %l9 = call <32 x i32> @llvm.smin.v32i32(<32 x i32> %l7, <32 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
80  %l10 = trunc <32 x i32> %l9 to <32 x i8>
81  ret <32 x i8> %l10
82}
83
; i16 reduction variant: ashr #15 and smin(32767) on the widened product
; select vqdmulh.s16, followed by vaddv.s16 for the add-reduction.
84define arm_aapcs_vfpcc i32 @vqdmulh_v8i16(<8 x i16> %s0, <8 x i16> %s1) {
85; CHECK-LABEL: vqdmulh_v8i16:
86; CHECK:       @ %bb.0: @ %entry
87; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
88; CHECK-NEXT:    vaddv.s16 r0, q0
89; CHECK-NEXT:    bx lr
90entry:
91  %l2 = sext <8 x i16> %s0 to <8 x i32>
92  %l5 = sext <8 x i16> %s1 to <8 x i32>
93  %l6 = mul nsw <8 x i32> %l5, %l2
94  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
95  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
96  %l10 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %l9)
97  ret i32 %l10
98}
99
; i16 pattern truncated back to <8 x i16>: folds to a single vqdmulh.s16.
100define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_b(<8 x i16> %s0, <8 x i16> %s1) {
101; CHECK-LABEL: vqdmulh_v8i16_b:
102; CHECK:       @ %bb.0: @ %entry
103; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
104; CHECK-NEXT:    bx lr
105entry:
106  %l2 = sext <8 x i16> %s0 to <8 x i32>
107  %l5 = sext <8 x i16> %s1 to <8 x i32>
108  %l6 = mul nsw <8 x i32> %l5, %l2
109  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
110  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
111  %l10 = trunc <8 x i32> %l9 to <8 x i16>
112  ret <8 x i16> %l10
113}
114
; Sub-128-bit (<4 x i16>) variant: vqdmulh.s16 plus a vmovlb.s16 to
; re-sign-extend the i16 results into their 32-bit lanes.
115define arm_aapcs_vfpcc <4 x i16> @vqdmulh_v4i16_b(<4 x i16> %s0, <4 x i16> %s1) {
116; CHECK-LABEL: vqdmulh_v4i16_b:
117; CHECK:       @ %bb.0: @ %entry
118; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
119; CHECK-NEXT:    vmovlb.s16 q0, q0
120; CHECK-NEXT:    bx lr
121entry:
122  %l2 = sext <4 x i16> %s0 to <4 x i32>
123  %l5 = sext <4 x i16> %s1 to <4 x i32>
124  %l6 = mul nsw <4 x i32> %l5, %l2
125  %l7 = ashr <4 x i32> %l6, <i32 15, i32 15, i32 15, i32 15>
126  %l9 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l7, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
127  %l10 = trunc <4 x i32> %l9 to <4 x i16>
128  ret <4 x i16> %l10
129}
130
; Double-register (<16 x i16>) variant: splits into two vqdmulh.s16 ops.
131define arm_aapcs_vfpcc <16 x i16> @vqdmulh_v16i16_b(<16 x i16> %s0, <16 x i16> %s1) {
132; CHECK-LABEL: vqdmulh_v16i16_b:
133; CHECK:       @ %bb.0: @ %entry
134; CHECK-NEXT:    vqdmulh.s16 q0, q2, q0
135; CHECK-NEXT:    vqdmulh.s16 q1, q3, q1
136; CHECK-NEXT:    bx lr
137entry:
138  %l2 = sext <16 x i16> %s0 to <16 x i32>
139  %l5 = sext <16 x i16> %s1 to <16 x i32>
140  %l6 = mul nsw <16 x i32> %l5, %l2
141  %l7 = ashr <16 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
142  %l9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %l7, <16 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
143  %l10 = trunc <16 x i32> %l9 to <16 x i16>
144  ret <16 x i16> %l10
145}
146
; Negative test: the intermediate type is i22, which is too narrow to hold
; the full i16*i16 product, so the vqdmulh fold must NOT fire. Codegen
; instead expands to widening multiplies (vmullb.s16) with shl/ashr pairs
; emulating the 22-bit arithmetic, plus lane shuffling to rebuild the result.
147define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_c(<8 x i16> %s0, <8 x i16> %s1) {
148; CHECK-LABEL: vqdmulh_v8i16_c:
149; CHECK:       @ %bb.0: @ %entry
150; CHECK-NEXT:    vmov q2, q0
151; CHECK-NEXT:    vmov.u16 r0, q0[2]
152; CHECK-NEXT:    vmov.u16 r1, q0[0]
153; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
154; CHECK-NEXT:    vmov.u16 r0, q2[3]
155; CHECK-NEXT:    vmov.u16 r1, q2[1]
156; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
157; CHECK-NEXT:    vmov.u16 r0, q1[2]
158; CHECK-NEXT:    vmov.u16 r1, q1[0]
159; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
160; CHECK-NEXT:    vmov.u16 r0, q1[3]
161; CHECK-NEXT:    vmov.u16 r1, q1[1]
162; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
163; CHECK-NEXT:    vmullb.s16 q0, q3, q0
164; CHECK-NEXT:    vshl.i32 q0, q0, #10
165; CHECK-NEXT:    vshr.s32 q0, q0, #10
166; CHECK-NEXT:    vshr.s32 q3, q0, #15
167; CHECK-NEXT:    vmov r0, r1, d6
168; CHECK-NEXT:    vmov.16 q0[0], r0
169; CHECK-NEXT:    vmov.16 q0[1], r1
170; CHECK-NEXT:    vmov r0, r1, d7
171; CHECK-NEXT:    vmov.16 q0[2], r0
172; CHECK-NEXT:    vmov.u16 r0, q2[6]
173; CHECK-NEXT:    vmov.16 q0[3], r1
174; CHECK-NEXT:    vmov.u16 r1, q2[4]
175; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
176; CHECK-NEXT:    vmov.u16 r0, q2[7]
177; CHECK-NEXT:    vmov.u16 r1, q2[5]
178; CHECK-NEXT:    vmov q3[3], q3[1], r1, r0
179; CHECK-NEXT:    vmov.u16 r0, q1[6]
180; CHECK-NEXT:    vmov.u16 r1, q1[4]
181; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
182; CHECK-NEXT:    vmov.u16 r0, q1[7]
183; CHECK-NEXT:    vmov.u16 r1, q1[5]
184; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
185; CHECK-NEXT:    vmullb.s16 q1, q2, q3
186; CHECK-NEXT:    vshl.i32 q1, q1, #10
187; CHECK-NEXT:    vshr.s32 q1, q1, #10
188; CHECK-NEXT:    vshr.s32 q1, q1, #15
189; CHECK-NEXT:    vmov r0, r1, d2
190; CHECK-NEXT:    vmov.16 q0[4], r0
191; CHECK-NEXT:    vmov.16 q0[5], r1
192; CHECK-NEXT:    vmov r0, r1, d3
193; CHECK-NEXT:    vmov.16 q0[6], r0
194; CHECK-NEXT:    vmov.16 q0[7], r1
195; CHECK-NEXT:    bx lr
196entry:
197  %l2 = sext <8 x i16> %s0 to <8 x i22>
198  %l5 = sext <8 x i16> %s1 to <8 x i22>
199  %l6 = mul nsw <8 x i22> %l5, %l2
200  %l7 = ashr <8 x i22> %l6, <i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15, i22 15>
201  %l9 = call <8 x i22> @llvm.smin.v8i22(<8 x i22> %l7, <8 x i22> <i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767, i22 32767>)
202  %l10 = trunc <8 x i22> %l9 to <8 x i16>
203  ret <8 x i16> %l10
204}
205
; Both inputs are deinterleaved (even lanes then odd lanes) before the
; multiply and the result is re-interleaved afterwards; the shuffles cancel,
; so this should still fold to a single vqdmulh.s16.
; NOTE(review): %l2 and %l5 are unused (dead sexts kept from the template).
206define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved(<8 x i16> %s0, <8 x i16> %s1) {
207; CHECK-LABEL: vqdmulh_v8i16_interleaved:
208; CHECK:       @ %bb.0: @ %entry
209; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
210; CHECK-NEXT:    bx lr
211entry:
212  %0 = shufflevector <8 x i16> %s0, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
213  %1 = sext <8 x i16> %0 to <8 x i32>
214  %l2 = sext <8 x i16> %s0 to <8 x i32>
215  %2 = shufflevector <8 x i16> %s1, <8 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
216  %3 = sext <8 x i16> %2 to <8 x i32>
217  %l5 = sext <8 x i16> %s1 to <8 x i32>
218  %l6 = mul nsw <8 x i32> %3, %1
219  %l7 = ashr <8 x i32> %l6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
220  %l9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %l7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
221  %l10 = trunc <8 x i32> %l9 to <8 x i16>
222  %4 = shufflevector <8 x i16> %l10, <8 x i16> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
223  ret <8 x i16> %4
224}
225
; Two separate vqdmulh patterns on the even and odd lanes of %s1 (both
; against the truncated %s0a), re-interleaved at the end. Codegen should
; form two vqdmulh.s16 ops (one on a vrev32.16'd copy) and merge the
; results with vmovnt.i32 instead of expanding the shuffles.
226define arm_aapcs_vfpcc <8 x i16> @vqdmulh_v8i16_interleaved2(<4 x i32> %s0a, <8 x i16> %s1) {
227; CHECK-LABEL: vqdmulh_v8i16_interleaved2:
228; CHECK:       @ %bb.0:
229; CHECK-NEXT:    vqdmulh.s16 q2, q1, q0
230; CHECK-NEXT:    vrev32.16 q1, q1
231; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
232; CHECK-NEXT:    vmovnt.i32 q2, q0
233; CHECK-NEXT:    vmov q0, q2
234; CHECK-NEXT:    bx lr
235  %s0 = trunc <4 x i32> %s0a to <4 x i16>
236  %strided.vec = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
237  %strided.vec44 = shufflevector <8 x i16> %s1, <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
238  %l7 = sext <4 x i16> %strided.vec to <4 x i32>
239  %l8 = sext <4 x i16> %s0 to <4 x i32>
240  %l9 = mul nsw <4 x i32> %l7, %l8
241  %l10 = ashr <4 x i32> %l9, <i32 15, i32 15, i32 15, i32 15>
242  %l12 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l10, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
243  %l13 = trunc <4 x i32> %l12 to <4 x i16>
244  %l14 = sext <4 x i16> %strided.vec44 to <4 x i32>
245  %l15 = mul nsw <4 x i32> %l14, %l8
246  %l16 = ashr <4 x i32> %l15, <i32 15, i32 15, i32 15, i32 15>
247  %l18 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %l16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
248  %l19 = trunc <4 x i32> %l18 to <4 x i16>
249  %interleaved.vec = shufflevector <4 x i16> %l13, <4 x i16> %l19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
250  ret <8 x i16> %interleaved.vec
251}
252
; i32 reduction variant: sext to i64, ashr #31, smin(INT32_MAX) selects
; vqdmulh.s32; the i64 add-reduction becomes vaddlv.s32.
253define arm_aapcs_vfpcc i64 @vqdmulh_v4i32(<4 x i32> %s0, <4 x i32> %s1) {
254; CHECK-LABEL: vqdmulh_v4i32:
255; CHECK:       @ %bb.0: @ %entry
256; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
257; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
258; CHECK-NEXT:    bx lr
259entry:
260  %l2 = sext <4 x i32> %s0 to <4 x i64>
261  %l5 = sext <4 x i32> %s1 to <4 x i64>
262  %l6 = mul nsw <4 x i64> %l5, %l2
263  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
264  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
265  %l10 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %l9)
266  ret i64 %l10
267}
268
; i32 pattern truncated back to <4 x i32>: folds to a single vqdmulh.s32.
269define arm_aapcs_vfpcc <4 x i32> @vqdmulh_v4i32_b(<4 x i32> %s0, <4 x i32> %s1) {
270; CHECK-LABEL: vqdmulh_v4i32_b:
271; CHECK:       @ %bb.0: @ %entry
272; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
273; CHECK-NEXT:    bx lr
274entry:
275  %l2 = sext <4 x i32> %s0 to <4 x i64>
276  %l5 = sext <4 x i32> %s1 to <4 x i64>
277  %l6 = mul nsw <4 x i64> %l5, %l2
278  %l7 = ashr <4 x i64> %l6, <i64 31, i64 31, i64 31, i64 31>
279  %l9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %l7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
280  %l10 = trunc <4 x i64> %l9 to <4 x i32>
281  ret <4 x i32> %l10
282}
283
; Sub-128-bit (<2 x i32>, held as i64 lanes) variant: still selects
; vqdmulh.s32; the high halves of the i64 lanes are rebuilt with scalar
; asrs (sign-extension of each 32-bit result).
284define arm_aapcs_vfpcc <2 x i32> @vqdmulh_v2i32_b(<2 x i32> %s0, <2 x i32> %s1) {
285; CHECK-LABEL: vqdmulh_v2i32_b:
286; CHECK:       @ %bb.0: @ %entry
287; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
288; CHECK-NEXT:    vmov r0, s2
289; CHECK-NEXT:    vmov r1, s0
290; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
291; CHECK-NEXT:    asrs r0, r0, #31
292; CHECK-NEXT:    asrs r1, r1, #31
293; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
294; CHECK-NEXT:    bx lr
295entry:
296  %l2 = sext <2 x i32> %s0 to <2 x i64>
297  %l5 = sext <2 x i32> %s1 to <2 x i64>
298  %l6 = mul nsw <2 x i64> %l5, %l2
299  %l7 = ashr <2 x i64> %l6, <i64 31, i64 31>
300  %l9 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> %l7, <2 x i64> <i64 2147483647, i64 2147483647>)
301  %l10 = trunc <2 x i64> %l9 to <2 x i32>
302  ret <2 x i32> %l10
303}
304
; Double-register (<8 x i32>) variant: splits into two vqdmulh.s32 ops.
305define arm_aapcs_vfpcc <8 x i32> @vqdmulh_v8i32_b(<8 x i32> %s0, <8 x i32> %s1) {
306; CHECK-LABEL: vqdmulh_v8i32_b:
307; CHECK:       @ %bb.0: @ %entry
308; CHECK-NEXT:    vqdmulh.s32 q0, q2, q0
309; CHECK-NEXT:    vqdmulh.s32 q1, q3, q1
310; CHECK-NEXT:    bx lr
311entry:
312  %l2 = sext <8 x i32> %s0 to <8 x i64>
313  %l5 = sext <8 x i32> %s1 to <8 x i64>
314  %l6 = mul nsw <8 x i64> %l5, %l2
315  %l7 = ashr <8 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
316  %l9 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> %l7, <8 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
317  %l10 = trunc <8 x i64> %l9 to <8 x i32>
318  ret <8 x i32> %l10
319}
320
; Quad-register (<16 x i32>) variant: under AAPCS-VFP the second operand
; spills to the stack, so each of the four vqdmulh.s32 ops pairs a register
; argument with a vldrw.u32 reload of the corresponding stack slot.
321define arm_aapcs_vfpcc <16 x i32> @vqdmulh_v16i32_b(<16 x i32> %s0, <16 x i32> %s1) {
322; CHECK-LABEL: vqdmulh_v16i32_b:
323; CHECK:       @ %bb.0: @ %entry
324; CHECK-NEXT:    .vsave {d8, d9}
325; CHECK-NEXT:    vpush {d8, d9}
326; CHECK-NEXT:    add r0, sp, #16
327; CHECK-NEXT:    vldrw.u32 q4, [r0]
328; CHECK-NEXT:    add r0, sp, #32
329; CHECK-NEXT:    vqdmulh.s32 q0, q4, q0
330; CHECK-NEXT:    vldrw.u32 q4, [r0]
331; CHECK-NEXT:    add r0, sp, #48
332; CHECK-NEXT:    vqdmulh.s32 q1, q4, q1
333; CHECK-NEXT:    vldrw.u32 q4, [r0]
334; CHECK-NEXT:    add r0, sp, #64
335; CHECK-NEXT:    vqdmulh.s32 q2, q4, q2
336; CHECK-NEXT:    vldrw.u32 q4, [r0]
337; CHECK-NEXT:    vqdmulh.s32 q3, q4, q3
338; CHECK-NEXT:    vpop {d8, d9}
339; CHECK-NEXT:    bx lr
340entry:
341  %l2 = sext <16 x i32> %s0 to <16 x i64>
342  %l5 = sext <16 x i32> %s1 to <16 x i64>
343  %l6 = mul nsw <16 x i64> %l5, %l2
344  %l7 = ashr <16 x i64> %l6, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
345  %l9 = call <16 x i64> @llvm.smin.v16i64(<16 x i64> %l7, <16 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
346  %l10 = trunc <16 x i64> %l9 to <16 x i32>
347  ret <16 x i32> %l10
348}
349
350
351
; Loop form of the i8 pattern, as the vectorizer would emit it: the body
; should become a low-overhead (le) loop containing a single vqdmulh.s8
; between the widened loads and the narrowing store.
; NOTE(review): %8 (icmp slt) is dead — a leftover from the select form of
; the min; the smin intrinsic in %9 is what feeds the result.
352define void @vqdmulh_loop_i8(i8* nocapture readonly %x, i8* nocapture readonly %y, i8* noalias nocapture %z, i32 %n) local_unnamed_addr #0 {
353; CHECK-LABEL: vqdmulh_loop_i8:
354; CHECK:       @ %bb.0: @ %entry
355; CHECK-NEXT:    .save {r7, lr}
356; CHECK-NEXT:    push {r7, lr}
357; CHECK-NEXT:    mov.w lr, #64
358; CHECK-NEXT:  .LBB17_1: @ %vector.body
359; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
360; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
361; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
362; CHECK-NEXT:    vqdmulh.s8 q0, q1, q0
363; CHECK-NEXT:    vstrb.8 q0, [r2], #16
364; CHECK-NEXT:    le lr, .LBB17_1
365; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
366; CHECK-NEXT:    pop {r7, pc}
367entry:
368  br label %vector.body
369
370vector.body:                                      ; preds = %vector.body, %entry
371  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
372  %0 = getelementptr inbounds i8, i8* %x, i32 %index
373  %1 = bitcast i8* %0 to <16 x i8>*
374  %wide.load = load <16 x i8>, <16 x i8>* %1, align 1
375  %2 = sext <16 x i8> %wide.load to <16 x i32>
376  %3 = getelementptr inbounds i8, i8* %y, i32 %index
377  %4 = bitcast i8* %3 to <16 x i8>*
378  %wide.load26 = load <16 x i8>, <16 x i8>* %4, align 1
379  %5 = sext <16 x i8> %wide.load26 to <16 x i32>
380  %6 = mul nsw <16 x i32> %5, %2
381  %7 = ashr <16 x i32> %6, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
382  %8 = icmp slt <16 x i32> %7, <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>
383  %9 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> %7, <16 x i32> <i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127, i32 127>)
384  %10 = trunc <16 x i32> %9 to <16 x i8>
385  %11 = getelementptr inbounds i8, i8* %z, i32 %index
386  %12 = bitcast i8* %11 to <16 x i8>*
387  store <16 x i8> %10, <16 x i8>* %12, align 1
388  %index.next = add i32 %index, 16
389  %13 = icmp eq i32 %index.next, 1024
390  br i1 %13, label %for.cond.cleanup, label %vector.body
391
392for.cond.cleanup:                                 ; preds = %vector.body
393  ret void
394}
395
; Loop form of the i16 pattern: expects an le loop with a single
; vqdmulh.s16 between the vldrh loads and the store.
; NOTE(review): %8 (icmp slt) is dead, as in vqdmulh_loop_i8.
396define void @vqdmulh_loop_i16(i16* nocapture readonly %x, i16* nocapture readonly %y, i16* noalias nocapture %z, i32 %n) {
397; CHECK-LABEL: vqdmulh_loop_i16:
398; CHECK:       @ %bb.0: @ %entry
399; CHECK-NEXT:    .save {r7, lr}
400; CHECK-NEXT:    push {r7, lr}
401; CHECK-NEXT:    mov.w lr, #128
402; CHECK-NEXT:  .LBB18_1: @ %vector.body
403; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
404; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
405; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
406; CHECK-NEXT:    vqdmulh.s16 q0, q1, q0
407; CHECK-NEXT:    vstrb.8 q0, [r2], #16
408; CHECK-NEXT:    le lr, .LBB18_1
409; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
410; CHECK-NEXT:    pop {r7, pc}
411entry:
412  br label %vector.body
413
414vector.body:                                      ; preds = %vector.body, %entry
415  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
416  %0 = getelementptr inbounds i16, i16* %x, i32 %index
417  %1 = bitcast i16* %0 to <8 x i16>*
418  %wide.load = load <8 x i16>, <8 x i16>* %1, align 2
419  %2 = sext <8 x i16> %wide.load to <8 x i32>
420  %3 = getelementptr inbounds i16, i16* %y, i32 %index
421  %4 = bitcast i16* %3 to <8 x i16>*
422  %wide.load30 = load <8 x i16>, <8 x i16>* %4, align 2
423  %5 = sext <8 x i16> %wide.load30 to <8 x i32>
424  %6 = mul nsw <8 x i32> %5, %2
425  %7 = ashr <8 x i32> %6, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
426  %8 = icmp slt <8 x i32> %7, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>
427  %9 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %7, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767>)
428  %10 = trunc <8 x i32> %9 to <8 x i16>
429  %11 = getelementptr inbounds i16, i16* %z, i32 %index
430  %12 = bitcast i16* %11 to <8 x i16>*
431  store <8 x i16> %10, <8 x i16>* %12, align 2
432  %index.next = add i32 %index, 8
433  %13 = icmp eq i32 %index.next, 1024
434  br i1 %13, label %for.cond.cleanup, label %vector.body
435
436for.cond.cleanup:                                 ; preds = %vector.body
437  ret void
438}
439
; Loop form of the i32 pattern: expects an le loop with a single
; vqdmulh.s32 between the vldrw loads and the store.
; NOTE(review): %8 (icmp slt) is dead, as in vqdmulh_loop_i8.
440define void @vqdmulh_loop_i32(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* noalias nocapture %z, i32 %n) {
441; CHECK-LABEL: vqdmulh_loop_i32:
442; CHECK:       @ %bb.0: @ %entry
443; CHECK-NEXT:    .save {r7, lr}
444; CHECK-NEXT:    push {r7, lr}
445; CHECK-NEXT:    mov.w lr, #256
446; CHECK-NEXT:  .LBB19_1: @ %vector.body
447; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
448; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
449; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
450; CHECK-NEXT:    vqdmulh.s32 q0, q1, q0
451; CHECK-NEXT:    vstrb.8 q0, [r2], #16
452; CHECK-NEXT:    le lr, .LBB19_1
453; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
454; CHECK-NEXT:    pop {r7, pc}
455entry:
456  br label %vector.body
457
458vector.body:                                      ; preds = %vector.body, %entry
459  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
460  %0 = getelementptr inbounds i32, i32* %x, i32 %index
461  %1 = bitcast i32* %0 to <4 x i32>*
462  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
463  %2 = sext <4 x i32> %wide.load to <4 x i64>
464  %3 = getelementptr inbounds i32, i32* %y, i32 %index
465  %4 = bitcast i32* %3 to <4 x i32>*
466  %wide.load30 = load <4 x i32>, <4 x i32>* %4, align 4
467  %5 = sext <4 x i32> %wide.load30 to <4 x i64>
468  %6 = mul nsw <4 x i64> %5, %2
469  %7 = ashr <4 x i64> %6, <i64 31, i64 31, i64 31, i64 31>
470  %8 = icmp slt <4 x i64> %7, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
471  %9 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %7, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
472  %10 = trunc <4 x i64> %9 to <4 x i32>
473  %11 = getelementptr inbounds i32, i32* %z, i32 %index
474  %12 = bitcast i32* %11 to <4 x i32>*
475  store <4 x i32> %10, <4 x i32>* %12, align 4
476  %index.next = add i32 %index, 4
477  %13 = icmp eq i32 %index.next, 1024
478  br i1 %13, label %for.cond.cleanup, label %vector.body
479
480for.cond.cleanup:                                 ; preds = %vector.body
481  ret void
482}
483
; Scalar (non-vector) version of the same mul/ashr/smin pattern: no vqdmulh
; exists for scalars, so this stays as smulbb + asr with a cmp/it-predicated
; select implementing the smin clamp.
484define i32 @scalar(i16 %a) {
485; CHECK-LABEL: scalar:
486; CHECK:       @ %bb.0:
487; CHECK-NEXT:    smulbb r1, r0, r0
488; CHECK-NEXT:    movs r0, #127
489; CHECK-NEXT:    asrs r2, r1, #7
490; CHECK-NEXT:    cmp r2, #127
491; CHECK-NEXT:    it lt
492; CHECK-NEXT:    asrlt r0, r1, #7
493; CHECK-NEXT:    bx lr
494  %e = sext i16 %a to i32
495  %d = mul nsw i32 %e, %e
496  %b = ashr i32 %d, 7
497  %c = call i32 @llvm.smin.i32(i32 %b, i32 127)
498  ret i32 %c
499}
500
501declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
502declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
503declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
504declare i32 @llvm.smin.i32(i32 %a, i32 %b)
505declare <2 x i64> @llvm.smin.v2i64(<2 x i64>, <2 x i64>)
506declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
507declare <8 x i64> @llvm.smin.v8i64(<8 x i64>, <8 x i64>)
508declare <16 x i64> @llvm.smin.v16i64(<16 x i64>, <16 x i64>)
509declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
510declare <8 x i32> @llvm.smin.v8i32(<8 x i32>, <8 x i32>)
511declare <16 x i32> @llvm.smin.v16i32(<16 x i32>, <16 x i32>)
512declare <32 x i32> @llvm.smin.v32i32(<32 x i32>, <32 x i32>)
513declare <8 x i22> @llvm.smin.v8i22(<8 x i22>, <8 x i22>)
514