1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -mtriple=armv8-a-linux-gnueabihf -arm-parallel-dsp -dce --verify %s -S -o - | FileCheck %s
3
; Function @a: one straight-line block (for.end) that ends in an infinite
; self-loop (for.cond23). Its i16 loads use constant-expression addresses that
; cast @a itself to i16* (elements 0 through 4); the first four feed a
; sext/mul/add chain whose result is stored to the local %b.
; The expected output below contains no llvm.arm.smlad call: apart from the
; unused bitcast %0 disappearing, the instructions are expected to come out in
; the same form as the input.
; (review) presumably a regression input for the pass; the original motivation
; is not recorded in this file - confirm against the commit history.
4define dso_local void @a() align 2 {
5; CHECK-LABEL: @a(
6; CHECK-NEXT:  for.end:
7; CHECK-NEXT:    [[B:%.*]] = alloca i32, align 4
8; CHECK-NEXT:    [[TMP0:%.*]] = load i16, i16* bitcast (void ()* @a to i16*), align 2
9; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
10; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV]], [[CONV]]
11; CHECK-NEXT:    [[TMP1:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 1), align 2
12; CHECK-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP1]] to i32
13; CHECK-NEXT:    [[MUL6:%.*]] = mul nsw i32 [[CONV3]], [[CONV3]]
14; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[MUL6]], [[MUL]]
15; CHECK-NEXT:    [[TMP2:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 2), align 2
16; CHECK-NEXT:    [[CONV11:%.*]] = sext i16 [[TMP2]] to i32
17; CHECK-NEXT:    [[MUL12:%.*]] = mul nsw i32 [[CONV11]], [[CONV3]]
18; CHECK-NEXT:    [[ADD14:%.*]] = add nsw i32 [[MUL12]], [[ADD]]
19; CHECK-NEXT:    [[TMP3:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 3), align 2
20; CHECK-NEXT:    [[CONV17:%.*]] = sext i16 [[TMP3]] to i32
21; CHECK-NEXT:    [[ADD19:%.*]] = add nsw i32 [[ADD14]], [[CONV17]]
22; CHECK-NEXT:    store i32 [[ADD19]], i32* [[B]], align 4
23; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 4), align 2
24; CHECK-NEXT:    [[CONV21:%.*]] = sext i16 [[TMP4]] to i32
25; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[CONV21]]
26; CHECK-NEXT:    [[ARRAYIDX22:%.*]] = getelementptr inbounds i32, i32* [[ADD_PTR]], i32 9
27; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX22]], align 4
28; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP5]], 1
29; CHECK-NEXT:    store i32 [[SHL]], i32* [[ARRAYIDX22]], align 4
30; CHECK-NEXT:    br label [[FOR_COND23:%.*]]
31; CHECK:       for.cond23:
32; CHECK-NEXT:    br label [[FOR_COND23]]
33;
34for.end:
35  %b = alloca i32, align 4
; %0 has no users; it does not appear in the expected output (the RUN line
; also runs -dce).
36  %0 = bitcast i32* %b to i8*
; Elements 0 and 1 are each squared; element 2 is multiplied by element 1.
37  %1 = load i16, i16* bitcast (void ()* @a to i16*), align 2
38  %conv = sext i16 %1 to i32
39  %mul = mul nsw i32 %conv, %conv
40  %2 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 1), align 2
41  %conv3 = sext i16 %2 to i32
42  %mul6 = mul nsw i32 %conv3, %conv3
43  %add = add nuw nsw i32 %mul6, %mul
44  %3 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 2), align 2
45  %conv11 = sext i16 %3 to i32
46  %mul12 = mul nsw i32 %conv11, %conv3
47  %add14 = add nsw i32 %mul12, %add
; Element 3 is added to the sum directly, without a multiply.
48  %4 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 3), align 2
49  %conv17 = sext i16 %4 to i32
50  %add19 = add nsw i32 %add14, %conv17
51  store i32 %add19, i32* %b, align 4
; Element 4 is not part of the sum: it is used as an i32 index off %b, and the
; word 9 elements past that is loaded, doubled (shl 1) and stored back.
52  %5 = load i16, i16* getelementptr (i16, i16* bitcast (void ()* @a to i16*), i32 4), align 2
53  %conv21 = sext i16 %5 to i32
54  %add.ptr = getelementptr inbounds i32, i32* %b, i32 %conv21
55  %arrayidx22 = getelementptr inbounds i32, i32* %add.ptr, i32 9
56  %6 = load i32, i32* %arrayidx22, align 4
57  %shl = shl i32 %6, 1
58  store i32 %shl, i32* %arrayidx22, align 4
59  br label %for.cond23
60
; for.cond23 branches only to itself, so the function never returns.
61for.cond23:                                       ; preds = %for.cond23, %for.end
62  br label %for.cond23
63}
64
; @accumulate_square_a0: reduction over three products where the first one is
; a square, a[0]*a[0], and the other two are a[1]*b[1] and a[2]*b[2]; %acc is
; added last.
; Expected output (see the assertions below): the i16 loads at a+1 and b+1 are
; replaced by i32 loads with align 2, and a single llvm.arm.smlad call consumes
; both wide values. The square of a[0], %acc, and one scalar a[1]*b[1] product
; (rebuilt with trunc + sext from the wide loads) form the accumulator operand
; of that call.
65define i32 @accumulate_square_a0(i16* %a, i16* %b, i32 %acc) {
66; CHECK-LABEL: @accumulate_square_a0(
67; CHECK-NEXT:  entry:
68; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
69; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
70; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, i16* [[A]]
71; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
72; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[ADDR_A_1]] to i32*
73; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2
74; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
75; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
76; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i16* [[ADDR_B_1]] to i32*
77; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 2
78; CHECK-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
79; CHECK-NEXT:    [[TMP7:%.*]] = sext i16 [[TMP6]] to i32
80; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
81; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[MUL_0]], [[ACC:%.*]]
82; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP3]], [[TMP7]]
83; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[MUL_1]], [[TMP8]]
84; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP5]], i32 [[TMP9]])
85; CHECK-NEXT:    ret i32 [[TMP10]]
86;
87entry:
88  %addr.a.1 = getelementptr i16, i16* %a, i32 1
89  %addr.b.1 = getelementptr i16, i16* %b, i32 1
90  %ld.a.0 = load i16, i16* %a
91  %sext.a.0 = sext i16 %ld.a.0 to i32
; %ld.b.0 (and %sext.b.0 below) have no users in this function and do not
; appear in the expected output.
92  %ld.b.0 = load i16, i16* %b
93  %ld.a.1 = load i16, i16* %addr.a.1
94  %ld.b.1 = load i16, i16* %addr.b.1
95  %sext.a.1 = sext i16 %ld.a.1 to i32
96  %sext.b.1 = sext i16 %ld.b.1 to i32
97  %sext.b.0 = sext i16 %ld.b.0 to i32
; %mul.0 is the square (both operands are a[0]); %mul.1 is a[1]*b[1].
98  %mul.0 = mul i32 %sext.a.0, %sext.a.0
99  %mul.1 = mul i32 %sext.a.1, %sext.b.1
100  %addr.a.2 = getelementptr i16, i16* %a, i32 2
101  %addr.b.2 = getelementptr i16, i16* %b, i32 2
102  %ld.a.2 = load i16, i16* %addr.a.2
103  %ld.b.2 = load i16, i16* %addr.b.2
104  %sext.a.2 = sext i16 %ld.a.2 to i32
105  %sext.b.2 = sext i16 %ld.b.2 to i32
106  %mul.2 = mul i32 %sext.a.2, %sext.b.2
; %mul.1 enters the reduction twice, once through %add and once through
; %add.1; the expected output keeps one scalar copy next to the smlad call.
107  %add = add i32 %mul.0, %mul.1
108  %add.1 = add i32 %mul.1, %mul.2
109  %add.2 = add i32 %add.1, %add
110  %res = add i32 %add.2, %acc
111  ret i32 %res
112}
113
; @accumulate_square_a2: reduction over a[0]*b[0], a[1]*b[1] and a square
; a[2]*a[2], followed by adding %acc and then the sign-extended b[2].
; Expected output (see the assertions below): the i16 loads at a and b are
; replaced by i32 loads with align 2 and fed to one llvm.arm.smlad call. The
; square of a[2], %acc, and one scalar a[1]*b[1] product (rebuilt from the
; upper halves via lshr 16 + trunc + sext) form the accumulator operand; the
; final add of the sign-extended b[2] stays after the call.
114define i32 @accumulate_square_a2(i16* %a, i16* %b, i32 %acc) {
115; CHECK-LABEL: @accumulate_square_a2(
116; CHECK-NEXT:  entry:
117; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[A:%.*]] to i32*
118; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 2
119; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 16
120; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
121; CHECK-NEXT:    [[TMP4:%.*]] = sext i16 [[TMP3]] to i32
122; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i16* [[B:%.*]] to i32*
123; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 2
124; CHECK-NEXT:    [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
125; CHECK-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i16
126; CHECK-NEXT:    [[TMP9:%.*]] = sext i16 [[TMP8]] to i32
127; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP4]], [[TMP9]]
128; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2
129; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
130; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]]
131; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
132; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
133; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
134; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_A_2]]
135; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[MUL_2]], [[ACC:%.*]]
136; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[MUL_1]], [[TMP10]]
137; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP6]], i32 [[TMP11]])
138; CHECK-NEXT:    [[RES:%.*]] = add i32 [[TMP12]], [[SEXT_B_2]]
139; CHECK-NEXT:    ret i32 [[RES]]
140;
141entry:
142  %addr.a.1 = getelementptr i16, i16* %a, i32 1
143  %addr.b.1 = getelementptr i16, i16* %b, i32 1
144  %ld.a.0 = load i16, i16* %a
145  %sext.a.0 = sext i16 %ld.a.0 to i32
146  %ld.b.0 = load i16, i16* %b
147  %ld.a.1 = load i16, i16* %addr.a.1
148  %ld.b.1 = load i16, i16* %addr.b.1
149  %sext.a.1 = sext i16 %ld.a.1 to i32
150  %sext.b.1 = sext i16 %ld.b.1 to i32
151  %sext.b.0 = sext i16 %ld.b.0 to i32
; Unlike the a0 variant, %mul.0 here is a genuine a[0]*b[0] product.
152  %mul.0 = mul i32 %sext.a.0, %sext.b.0
153  %mul.1 = mul i32 %sext.a.1, %sext.b.1
154  %addr.a.2 = getelementptr i16, i16* %a, i32 2
155  %addr.b.2 = getelementptr i16, i16* %b, i32 2
156  %ld.a.2 = load i16, i16* %addr.a.2
157  %ld.b.2 = load i16, i16* %addr.b.2
158  %sext.a.2 = sext i16 %ld.a.2 to i32
159  %sext.b.2 = sext i16 %ld.b.2 to i32
; %mul.2 is the square: both operands are a[2]. b[2] is still loaded because
; %sext.b.2 is added to the result at the end.
160  %mul.2 = mul i32 %sext.a.2, %sext.a.2
; %mul.1 enters the reduction twice (through %add and %add.1).
161  %add = add i32 %mul.0, %mul.1
162  %add.1 = add i32 %mul.1, %mul.2
163  %add.2 = add i32 %add.1, %add
164  %add.3 = add i32 %add.2, %acc
165  %res = add i32 %add.3, %sext.b.2
166  ret i32 %res
167}
168
; @accumulate_square_b2: reduction over two squares, a[0]*a[0] and b[2]*b[2],
; and one mixed product a[1]*b[1], plus %acc.
; Expected output (see the assertions below): no llvm.arm.smlad call and no
; widened loads; the scalar loads, multiplies and adds remain, and only the
; values without users (the b[0] load/sext, the a[2] address/load/sext, and
; %add.3) are gone.
; (review) %add.3 is computed but %res uses %add.2, so the sign-extended a[2]
; never reaches the result. The assertions match the IR as written; confirm
; whether that is intentional before changing either.
169define i32 @accumulate_square_b2(i16* %a, i16* %b, i32 %acc) {
170; CHECK-LABEL: @accumulate_square_b2(
171; CHECK-NEXT:  entry:
172; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
173; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
174; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, i16* [[A]]
175; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
176; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]]
177; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]]
178; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
179; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
180; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
181; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_B_1]]
182; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
183; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
184; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
185; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_B_2]], [[SEXT_B_2]]
186; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
187; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
188; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
189; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC:%.*]]
190; CHECK-NEXT:    ret i32 [[RES]]
191;
192entry:
193  %addr.a.1 = getelementptr i16, i16* %a, i32 1
194  %addr.b.1 = getelementptr i16, i16* %b, i32 1
195  %ld.a.0 = load i16, i16* %a
196  %sext.a.0 = sext i16 %ld.a.0 to i32
; %ld.b.0 (and %sext.b.0 below) have no users in this function.
197  %ld.b.0 = load i16, i16* %b
198  %ld.a.1 = load i16, i16* %addr.a.1
199  %ld.b.1 = load i16, i16* %addr.b.1
200  %sext.a.1 = sext i16 %ld.a.1 to i32
201  %sext.b.1 = sext i16 %ld.b.1 to i32
202  %sext.b.0 = sext i16 %ld.b.0 to i32
; %mul.0 squares a[0]; %mul.1 is the only product that mixes %a and %b.
203  %mul.0 = mul i32 %sext.a.0, %sext.a.0
204  %mul.1 = mul i32 %sext.a.1, %sext.b.1
205  %addr.a.2 = getelementptr i16, i16* %a, i32 2
206  %addr.b.2 = getelementptr i16, i16* %b, i32 2
207  %ld.a.2 = load i16, i16* %addr.a.2
208  %ld.b.2 = load i16, i16* %addr.b.2
209  %sext.a.2 = sext i16 %ld.a.2 to i32
210  %sext.b.2 = sext i16 %ld.b.2 to i32
; %mul.2 squares b[2].
211  %mul.2 = mul i32 %sext.b.2, %sext.b.2
212  %add = add i32 %mul.0, %mul.1
213  %add.1 = add i32 %mul.1, %mul.2
214  %add.2 = add i32 %add.1, %add
; %add.3 is the only user of %sext.a.2 and itself has no users: %res below
; reads %add.2.
215  %add.3 = add i32 %add.2, %sext.a.2
216  %res = add i32 %add.2, %acc
217  ret i32 %res
218}
219
; @accumulate_square_a1: reduction over two squares, a[0]*a[0] and a[1]*a[1],
; and one mixed product a[2]*b[2]; the sign-extended b[1] and a[2] values and
; %acc are also added in directly.
; Expected output (see the assertions below): no llvm.arm.smlad call and no
; widened loads; every instruction with a user is expected to remain in its
; scalar form, and only the unused b[0] load/sext is gone.
220define i32 @accumulate_square_a1(i16* %a, i16* %b, i32 %acc) {
221; CHECK-LABEL: @accumulate_square_a1(
222; CHECK-NEXT:  entry:
223; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 1
224; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 1
225; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, i16* [[A]]
226; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
227; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, i16* [[ADDR_A_1]]
228; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, i16* [[ADDR_B_1]]
229; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
230; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
231; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[SEXT_A_0]], [[SEXT_A_0]]
232; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[SEXT_A_1]], [[SEXT_A_1]]
233; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, i16* [[A]], i32 2
234; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, i16* [[B]], i32 2
235; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, i16* [[ADDR_A_2]]
236; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, i16* [[ADDR_B_2]]
237; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
238; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
239; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_A_2]], [[SEXT_B_2]]
240; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_1]], [[SEXT_B_1]]
241; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_0]], [[ADD]]
242; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
243; CHECK-NEXT:    [[ADD_3:%.*]] = add i32 [[ADD_2]], [[ADD_1]]
244; CHECK-NEXT:    [[ADD_4:%.*]] = add i32 [[ADD_3]], [[SEXT_A_2]]
245; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_4]], [[ACC:%.*]]
246; CHECK-NEXT:    ret i32 [[RES]]
247;
248entry:
249  %addr.a.1 = getelementptr i16, i16* %a, i32 1
250  %addr.b.1 = getelementptr i16, i16* %b, i32 1
251  %ld.a.0 = load i16, i16* %a
252  %sext.a.0 = sext i16 %ld.a.0 to i32
; %ld.b.0 (and %sext.b.0 below) have no users in this function and do not
; appear in the expected output.
253  %ld.b.0 = load i16, i16* %b
254  %ld.a.1 = load i16, i16* %addr.a.1
255  %ld.b.1 = load i16, i16* %addr.b.1
256  %sext.a.1 = sext i16 %ld.a.1 to i32
257  %sext.b.1 = sext i16 %ld.b.1 to i32
258  %sext.b.0 = sext i16 %ld.b.0 to i32
; Both %mul.0 and %mul.1 are squares of %a elements; b[1] is only used as a
; plain addend (%add below), not as a multiplicand.
259  %mul.0 = mul i32 %sext.a.0, %sext.a.0
260  %mul.1 = mul i32 %sext.a.1, %sext.a.1
261  %addr.a.2 = getelementptr i16, i16* %a, i32 2
262  %addr.b.2 = getelementptr i16, i16* %b, i32 2
263  %ld.a.2 = load i16, i16* %addr.a.2
264  %ld.b.2 = load i16, i16* %addr.b.2
265  %sext.a.2 = sext i16 %ld.a.2 to i32
266  %sext.b.2 = sext i16 %ld.b.2 to i32
; %mul.2 is the only product that mixes %a and %b.
267  %mul.2 = mul i32 %sext.a.2, %sext.b.2
; %mul.1 enters the reduction twice (through %add and %add.2); %sext.a.2 is
; used both as a multiplicand of %mul.2 and as a direct addend in %add.4.
268  %add = add i32 %mul.1, %sext.b.1
269  %add.1 = add i32 %mul.0, %add
270  %add.2 = add i32 %mul.1, %mul.2
271  %add.3 = add i32 %add.2, %add.1
272  %add.4 = add i32 %add.3, %sext.a.2
273  %res = add i32 %add.4, %acc
274  ret i32 %res
275}
276