1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -mcpu=krait | FileCheck %s
3
4define arm_aapcs_vfpcc <4 x i16> @mla_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
5; CHECK-LABEL: mla_args:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vmull.u16 q8, d1, d0
8; CHECK-NEXT:    vaddw.u16 q8, q8, d2
9; CHECK-NEXT:    vmovn.i32 d0, q8
10; CHECK-NEXT:    bx lr
11entry:
12  %v0 = sext <4 x i16> %vec0 to <4 x i32>
13  %v1 = sext <4 x i16> %vec1 to <4 x i32>
14  %v2 = sext <4 x i16> %vec2 to <4 x i32>
15  %v3 = mul <4 x i32> %v1, %v0
16  %v4 = add <4 x i32> %v3, %v2
17  %v5 = trunc <4 x i32> %v4 to <4 x i16>
18  ret <4 x i16> %v5
19}
20
21define void @mla_loadstore(i16* %a, i16* %b, i16* %c) {
22; CHECK-LABEL: mla_loadstore:
23; CHECK:       @ %bb.0: @ %entry
24; CHECK-NEXT:    vldr d16, [r0, #16]
25; CHECK-NEXT:    vldr d17, [r1, #16]
26; CHECK-NEXT:    vldr d18, [r2, #16]
27; CHECK-NEXT:    vmull.u16 q8, d17, d16
28; CHECK-NEXT:    vaddw.u16 q8, q8, d18
29; CHECK-NEXT:    vmovn.i32 d16, q8
30; CHECK-NEXT:    vstr d16, [r0, #16]
31; CHECK-NEXT:    bx lr
32entry:
33  %scevgep0 = getelementptr i16, i16* %a, i32 8
34  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
35  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
36  %v0 = sext <4 x i16> %vec0 to <4 x i32>
37  %scevgep1 = getelementptr i16, i16* %b, i32 8
38  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
39  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
40  %v1 = sext <4 x i16> %vec1 to <4 x i32>
41  %scevgep2 = getelementptr i16, i16* %c, i32 8
42  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
43  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
44  %v2 = sext <4 x i16> %vec2 to <4 x i32>
45  %v3 = mul <4 x i32> %v1, %v0
46  %v4 = add <4 x i32> %v3, %v2
47  %v5 = trunc <4 x i32> %v4 to <4 x i16>
48  %scevgep3 = getelementptr i16, i16* %a, i32 8
49  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
50  store <4 x i16> %v5, <4 x i16>* %vector_ptr3, align 8
51  ret void
52}
53
54define arm_aapcs_vfpcc <4 x i16> @addmul_args(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
55; CHECK-LABEL: addmul_args:
56; CHECK:       @ %bb.0: @ %entry
57; CHECK-NEXT:    vmull.u16 q8, d1, d2
58; CHECK-NEXT:    vmlal.u16 q8, d0, d2
59; CHECK-NEXT:    vmovn.i32 d0, q8
60; CHECK-NEXT:    bx lr
61entry:
62  %v0 = sext <4 x i16> %vec0 to <4 x i32>
63  %v1 = sext <4 x i16> %vec1 to <4 x i32>
64  %v2 = sext <4 x i16> %vec2 to <4 x i32>
65  %v3 = add <4 x i32> %v1, %v0
66  %v4 = mul <4 x i32> %v3, %v2
67  %v5 = trunc <4 x i32> %v4 to <4 x i16>
68  ret <4 x i16> %v5
69}
70
71define void @addmul_loadstore(i16* %a, i16* %b, i16* %c) {
72; CHECK-LABEL: addmul_loadstore:
73; CHECK:       @ %bb.0: @ %entry
74; CHECK-NEXT:    vldr d16, [r2, #16]
75; CHECK-NEXT:    vldr d17, [r1, #16]
76; CHECK-NEXT:    vmull.u16 q9, d17, d16
77; CHECK-NEXT:    vldr d17, [r0, #16]
78; CHECK-NEXT:    vmlal.u16 q9, d17, d16
79; CHECK-NEXT:    vmovn.i32 d16, q9
80; CHECK-NEXT:    vstr d16, [r0, #16]
81; CHECK-NEXT:    bx lr
82entry:
83  %scevgep0 = getelementptr i16, i16* %a, i32 8
84  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
85  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
86  %v0 = sext <4 x i16> %vec0 to <4 x i32>
87  %scevgep1 = getelementptr i16, i16* %b, i32 8
88  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
89  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
90  %v1 = sext <4 x i16> %vec1 to <4 x i32>
91  %scevgep2 = getelementptr i16, i16* %c, i32 8
92  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
93  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
94  %v2 = sext <4 x i16> %vec2 to <4 x i32>
95  %v3 = add <4 x i32> %v1, %v0
96  %v4 = mul <4 x i32> %v3, %v2
97  %v5 = trunc <4 x i32> %v4 to <4 x i16>
98  %scevgep3 = getelementptr i16, i16* %a, i32 8
99  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
100  store <4 x i16> %v5, <4 x i16>* %vector_ptr3, align 8
101  ret void
102}
103
104define void @func1(i16* %a, i16* %b, i16* %c) {
105; CHECK-LABEL: func1:
106; CHECK:       @ %bb.0: @ %entry
107; CHECK-NEXT:    add r3, r1, #16
108; CHECK-NEXT:    vldr d18, [r2, #16]
109; CHECK-NEXT:    vld1.16 {d16}, [r3:64]
110; CHECK-NEXT:    vmovl.u16 q8, d16
111; CHECK-NEXT:    vaddw.u16 q10, q8, d18
112; CHECK-NEXT:    vmovn.i32 d19, q10
113; CHECK-NEXT:    vldr d20, [r0, #16]
114; CHECK-NEXT:    vstr d19, [r0, #16]
115; CHECK-NEXT:    vldr d19, [r2, #16]
116; CHECK-NEXT:    vmull.s16 q11, d18, d19
117; CHECK-NEXT:    vmovl.s16 q9, d19
118; CHECK-NEXT:    vmla.i32 q11, q8, q9
119; CHECK-NEXT:    vmovn.i32 d16, q11
120; CHECK-NEXT:    vstr d16, [r1, #16]
121; CHECK-NEXT:    vldr d16, [r2, #16]
122; CHECK-NEXT:    vmlal.u16 q11, d16, d20
123; CHECK-NEXT:    vmovn.i32 d16, q11
124; CHECK-NEXT:    vstr d16, [r0, #16]
125; CHECK-NEXT:    bx lr
126entry:
127; The test case trying to vectorize the pseudo code below.
128; a[i] = b[i] + c[i];
129; b[i] = a[i] * c[i];
130; a[i] = b[i] + a[i] * c[i];
131; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i]" is
132; scheduled before the first vector store to "a[i] = b[i] + c[i]".
133; Checking that there is no vector load a[i] scheduled between the vector
134; stores to a[i], otherwise the load of a[i] will be polluted by the first
135; vector store to a[i].
136; This test case check that the chain information is updated during
137; lowerMUL for the new created Load SDNode.
138
139
140  %scevgep0 = getelementptr i16, i16* %a, i32 8
141  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
142  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
143  %scevgep1 = getelementptr i16, i16* %b, i32 8
144  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
145  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
146  %0 = zext <4 x i16> %vec1 to <4 x i32>
147  %scevgep2 = getelementptr i16, i16* %c, i32 8
148  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
149  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
150  %1 = sext <4 x i16> %vec2 to <4 x i32>
151  %vec3 = add <4 x i32> %1, %0
152  %2 = trunc <4 x i32> %vec3 to <4 x i16>
153  %scevgep3 = getelementptr i16, i16* %a, i32 8
154  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
155  store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8
156  %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>*
157  %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8
158  %3 = sext <4 x i16> %vec4 to <4 x i32>
159  %vec5 = mul <4 x i32> %3, %vec3
160  %4 = trunc <4 x i32> %vec5 to <4 x i16>
161  %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>*
162  store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8
163  %5 = sext <4 x i16> %vec0 to <4 x i32>
164  %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>*
165  %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8
166  %6 = sext <4 x i16> %vec6 to <4 x i32>
167  %vec7 = mul <4 x i32> %6, %5
168  %vec8 = add <4 x i32> %vec7, %vec5
169  %7 = trunc <4 x i32> %vec8 to <4 x i16>
170  %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>*
171  store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8
172  ret void
173}
174
175define void @func2(i16* %a, i16* %b, i16* %c) {
176; CHECK-LABEL: func2:
177; CHECK:       @ %bb.0: @ %entry
178; CHECK-NEXT:    vldr d16, [r1, #16]
179; CHECK-NEXT:    add r3, r0, #16
180; CHECK-NEXT:    vldr d17, [r2, #16]
181; CHECK-NEXT:    vaddl.u16 q9, d17, d16
182; CHECK-NEXT:    vmovn.i32 d18, q9
183; CHECK-NEXT:    vld1.16 {d19}, [r3:64]
184; CHECK-NEXT:    vstr d18, [r0, #16]
185; CHECK-NEXT:    vldr d18, [r2, #16]
186; CHECK-NEXT:    vmull.s16 q10, d17, d18
187; CHECK-NEXT:    vmovl.s16 q11, d18
188; CHECK-NEXT:    vmovl.u16 q8, d16
189; CHECK-NEXT:    vmovl.s16 q9, d19
190; CHECK-NEXT:    vmla.i32 q10, q8, q11
191; CHECK-NEXT:    vmovn.i32 d16, q10
192; CHECK-NEXT:    vstr d16, [r1, #16]
193; CHECK-NEXT:    add r1, r2, #16
194; CHECK-NEXT:    vld1.16 {d16}, [r1:64]
195; CHECK-NEXT:    vmovl.u16 q8, d16
196; CHECK-NEXT:    vmla.i32 q10, q8, q9
197; CHECK-NEXT:    vadd.i32 q8, q10, q9
198; CHECK-NEXT:    vmovn.i32 d16, q8
199; CHECK-NEXT:    vstr d16, [r0, #16]
200; CHECK-NEXT:    bx lr
201entry:
202; The test case trying to vectorize the pseudo code below.
203; a[i] = b[i] + c[i];
204; b[i] = a[i] * c[i];
205; a[i] = b[i] + a[i] * c[i] + a[i];
206; Checking that vector load a[i] for "a[i] = b[i] + a[i] * c[i] + a[i]"
207; is scheduled before the first vector store to "a[i] = b[i] + c[i]".
208; Checking that there is no vector load a[i] scheduled between the first
209; vector store to a[i] and the vector add of a[i], otherwise the load of
210; a[i] will be polluted by the first vector store to a[i].
211; This test case check that both the chain and value of the new created
212; Load SDNode are updated during lowerMUL.
213
214
215  %scevgep0 = getelementptr i16, i16* %a, i32 8
216  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
217  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
218  %scevgep1 = getelementptr i16, i16* %b, i32 8
219  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
220  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
221  %0 = zext <4 x i16> %vec1 to <4 x i32>
222  %scevgep2 = getelementptr i16, i16* %c, i32 8
223  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
224  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
225  %1 = sext <4 x i16> %vec2 to <4 x i32>
226  %vec3 = add <4 x i32> %1, %0
227  %2 = trunc <4 x i32> %vec3 to <4 x i16>
228  %scevgep3 = getelementptr i16, i16* %a, i32 8
229  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
230  store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8
231  %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>*
232  %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8
233  %3 = sext <4 x i16> %vec4 to <4 x i32>
234  %vec5 = mul <4 x i32> %3, %vec3
235  %4 = trunc <4 x i32> %vec5 to <4 x i16>
236  %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>*
237  store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8
238  %5 = sext <4 x i16> %vec0 to <4 x i32>
239  %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>*
240  %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8
241  %6 = sext <4 x i16> %vec6 to <4 x i32>
242  %vec7 = mul <4 x i32> %6, %5
243  %vec8 = add <4 x i32> %vec7, %vec5
244  %vec9 = add <4 x i32> %vec8, %5
245  %7 = trunc <4 x i32> %vec9 to <4 x i16>
246  %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>*
247  store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8
248  ret void
249}
250