1; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s
2
3; CHECK-LABEL: overlap_1
4; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
5; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
6; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
7; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
8; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
9; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
10; CHECK: [[ACC:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
11; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
12; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
13; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
14; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
15; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_1]], i32 [[LD_B_1]], i32 [[ACC]])
16; CHECK: ret i32 [[RES]]
; overlap_1: i32 sum of products over a[0..2] and b[0..2] in which the middle
; product (%mul.1 = a[1]*b[1]) is an operand of both partial sums, so the two
; candidate halfword pairs (elements 0,1 and elements 1,2) overlap.
; The FileCheck patterns preceding this function expect two chained
; @llvm.arm.smlad calls: the first on i32 loads through %a and %b seeded with
; %acc, the second on i32 loads through the "+1" GEPs seeded with the first.
; NOTE(review): every line of this file carries a decimal number fused to the
; start of its text (e.g. directly before "define"); this looks like a
; line-number artifact that opt would reject - confirm and strip if so.
17define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
18entry:
  ; Addresses of element 1 in each array.
19  %addr.a.1 = getelementptr i16, i16* %a, i32 1
20  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  ; Narrow loads and sign extensions for elements 0 and 1; the loads and
  ; sexts of the two arrays are deliberately interleaved rather than grouped.
21  %ld.a.0 = load i16, i16* %a
22  %sext.a.0 = sext i16 %ld.a.0 to i32
23  %ld.b.0 = load i16, i16* %b
24  %ld.a.1 = load i16, i16* %addr.a.1
25  %ld.b.1 = load i16, i16* %addr.b.1
26  %sext.a.1 = sext i16 %ld.a.1 to i32
27  %sext.b.1 = sext i16 %ld.b.1 to i32
28  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; mul.0 = a[0]*b[0], mul.1 = a[1]*b[1]
29  %mul.0 = mul i32 %sext.a.0, %sext.b.0
30  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Element 2 of each array: address, load, sign-extend, multiply.
31  %addr.a.2 = getelementptr i16, i16* %a, i32 2
32  %addr.b.2 = getelementptr i16, i16* %b, i32 2
33  %ld.a.2 = load i16, i16* %addr.a.2
34  %ld.b.2 = load i16, i16* %addr.b.2
35  %sext.a.2 = sext i16 %ld.a.2 to i32
36  %sext.b.2 = sext i16 %ld.b.2 to i32
37  %mul.2 = mul i32 %sext.a.2, %sext.b.2
  ; Reduction: %mul.1 appears in both %add and %add.1 (the overlap), then the
  ; two partial sums and the incoming accumulator are added together.
38  %add = add i32 %mul.0, %mul.1
39  %add.1 = add i32 %mul.1, %mul.2
40  %add.2 = add i32 %add.1, %add
41  %res = add i32 %add.2, %acc
42  ret i32 %res
43}
44
45; TODO: Is it really best to generate smlald for the first instruction? Does
46; this just increase register pressure unnecessarily?
47; CHECK-LABEL: overlap_64_1
48; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
49; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
50; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
51; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
52; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
53; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
54; CHECK: [[ACC:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A]], i32 [[LD_B]], i64 %acc)
55; CHECK: [[CAST_A_1:%[^ ]+]] = bitcast i16* [[ADDR_A_1]] to i32*
56; CHECK: [[LD_A_1:%[^ ]+]] = load i32, i32* [[CAST_A_1]]
57; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[ADDR_B_1]] to i32*
58; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
59; CHECK: [[RES:%[^ ]+]] = call i64 @llvm.arm.smlald(i32 [[LD_A_1]], i32 [[LD_B_1]], i64 [[ACC]])
60; CHECK: ret i64 [[RES]]
; overlap_64_1: same overlapping sum of products as the i32 variant (the
; a[1]*b[1] product feeds both partial sums), but each i32 partial sum is
; sign-extended to i64 before being combined with the i64 accumulator %acc.
; The FileCheck patterns preceding this function expect two chained
; @llvm.arm.smlald calls: the first on i32 loads through %a and %b seeded with
; %acc, the second on i32 loads through the "+1" GEPs seeded with the first.
61define i64 @overlap_64_1(i16* %a, i16* %b, i64 %acc) {
62entry:
  ; Addresses of element 1 in each array.
63  %addr.a.1 = getelementptr i16, i16* %a, i32 1
64  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  ; Narrow loads and sign extensions for elements 0 and 1 (interleaved order).
65  %ld.a.0 = load i16, i16* %a
66  %sext.a.0 = sext i16 %ld.a.0 to i32
67  %ld.b.0 = load i16, i16* %b
68  %ld.a.1 = load i16, i16* %addr.a.1
69  %ld.b.1 = load i16, i16* %addr.b.1
70  %sext.a.1 = sext i16 %ld.a.1 to i32
71  %sext.b.1 = sext i16 %ld.b.1 to i32
72  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; mul.0 = a[0]*b[0], mul.1 = a[1]*b[1] (both computed in i32)
73  %mul.0 = mul i32 %sext.a.0, %sext.b.0
74  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Element 2 of each array: address, load, sign-extend, multiply.
75  %addr.a.2 = getelementptr i16, i16* %a, i32 2
76  %addr.b.2 = getelementptr i16, i16* %b, i32 2
77  %ld.a.2 = load i16, i16* %addr.a.2
78  %ld.b.2 = load i16, i16* %addr.b.2
79  %sext.a.2 = sext i16 %ld.a.2 to i32
80  %sext.b.2 = sext i16 %ld.b.2 to i32
81  %mul.2 = mul i32 %sext.a.2, %sext.b.2
  ; i32 partial sums; %mul.1 is an operand of both (the overlap).
82  %add = add i32 %mul.0, %mul.1
83  %add.1 = add i32 %mul.1, %mul.2
  ; Widen each partial sum to i64, then accumulate in 64 bits.
84  %sext.add = sext i32 %add to i64
85  %sext.add.1 = sext i32 %add.1 to i64
86  %add.2 = add i64 %sext.add.1, %sext.add
87  %res = add i64 %add.2, %acc
88  ret i64 %res
89}
90
91; CHECK-LABEL: overlap_2
92; CHECK: [[ADDR_A_1:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
93; CHECK: [[ADDR_B_1:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
94; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
95; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
96; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
97; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
98; CHECK: [[ACC1:%[^ ]+]] = add i32 %mul.1, %acc
99; CHECK: [[ACC2:%[^ ]+]] = add i32 %mul.2, [[ACC1]]
100; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[ACC2]])
101; CHECK: ret i32 [[RES]]
; overlap_2: overlapping i32 sum of products over a[0..2] and b[0..2] where
; %mul.1 again feeds both partial sums, but %mul.2 is written with its operands
; in (b, a) order and %add.2 takes %add as its first operand.
; The FileCheck patterns preceding this function expect a single
; @llvm.arm.smlad on i32 loads through %a and %b; %mul.1 and %mul.2 are
; expected to survive by name and be folded into %acc with ordinary i32 adds
; that supply the intrinsic's accumulator operand.
; NOTE(review): presumably the operand-order changes are what prevent a second
; smlad from being formed here - confirm against the pass implementation.
102define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) {
103entry:
  ; Addresses of element 1 in each array.
104  %addr.a.1 = getelementptr i16, i16* %a, i32 1
105  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  ; Narrow loads and sign extensions for elements 0 and 1 (interleaved order).
106  %ld.a.0 = load i16, i16* %a
107  %sext.a.0 = sext i16 %ld.a.0 to i32
108  %ld.b.0 = load i16, i16* %b
109  %ld.a.1 = load i16, i16* %addr.a.1
110  %ld.b.1 = load i16, i16* %addr.b.1
111  %sext.a.1 = sext i16 %ld.a.1 to i32
112  %sext.b.1 = sext i16 %ld.b.1 to i32
113  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; mul.0 = a[0]*b[0], mul.1 = a[1]*b[1]
114  %mul.0 = mul i32 %sext.a.0, %sext.b.0
115  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Element 2 of each array: address, load, sign-extend.
116  %addr.a.2 = getelementptr i16, i16* %a, i32 2
117  %addr.b.2 = getelementptr i16, i16* %b, i32 2
118  %ld.a.2 = load i16, i16* %addr.a.2
119  %ld.b.2 = load i16, i16* %addr.b.2
120  %sext.a.2 = sext i16 %ld.a.2 to i32
121  %sext.b.2 = sext i16 %ld.b.2 to i32
  ; Note the operand order: the b value comes first, unlike %mul.0 and %mul.1.
122  %mul.2 = mul i32 %sext.b.2, %sext.a.2
  ; Reduction: %mul.1 appears in both %add and %add.1 (the overlap).
123  %add = add i32 %mul.0, %mul.1
124  %add.1 = add i32 %mul.1, %mul.2
125  %add.2 = add i32 %add, %add.1
126  %res = add i32 %add.2, %acc
127  ret i32 %res
128}
129
130; CHECK-LABEL: overlap_3
131; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
132; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
133; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
134; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
135; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
136; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
137; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
138; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
139; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
140; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
141; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
142; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
143; CHECK: ret i32 [[RES]]
; overlap_3: i32 sum of four products over a[0..3] and b[0..2] in which the
; sign-extended b[1] is an operand of two different products:
;   mul.0 = a[0]*b[0], mul.1 = a[1]*b[1], mul.2 = a[2]*b[1], mul.3 = a[3]*b[2]
; so the b-side halfword pairs (b[0],b[1]) and (b[1],b[2]) overlap while the
; a-side pairs (a[0],a[1]) and (a[2],a[3]) do not.
; The FileCheck patterns preceding this function expect two chained
; @llvm.arm.smlad calls: the first on i32 loads through %a and %b seeded with
; %acc, the second on i32 loads through %a+2 and %b+1.
144define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
145entry:
  ; Addresses of element 1 in each array.
146  %addr.a.1 = getelementptr i16, i16* %a, i32 1
147  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  ; Narrow loads and sign extensions for elements 0 and 1 (interleaved order).
148  %ld.a.0 = load i16, i16* %a
149  %sext.a.0 = sext i16 %ld.a.0 to i32
150  %ld.b.0 = load i16, i16* %b
151  %ld.a.1 = load i16, i16* %addr.a.1
152  %ld.b.1 = load i16, i16* %addr.b.1
153  %sext.a.1 = sext i16 %ld.a.1 to i32
154  %sext.b.1 = sext i16 %ld.b.1 to i32
155  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair of products: a[0]*b[0] and a[1]*b[1].
156  %mul.0 = mul i32 %sext.a.0, %sext.b.0
157  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Addresses, loads and sign extensions for a[2], b[2] and a[3].
158  %addr.a.2 = getelementptr i16, i16* %a, i32 2
159  %addr.b.2 = getelementptr i16, i16* %b, i32 2
160  %addr.a.3 = getelementptr i16, i16* %a, i32 3
161  %ld.a.2 = load i16, i16* %addr.a.2
162  %ld.b.2 = load i16, i16* %addr.b.2
163  %ld.a.3 = load i16, i16* %addr.a.3
164  %sext.a.2 = sext i16 %ld.a.2 to i32
165  %sext.b.2 = sext i16 %ld.b.2 to i32
166  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Second pair of products: a[2]*b[1] (reusing %sext.b.1) and a[3]*b[2].
167  %mul.2 = mul i32 %sext.a.2, %sext.b.1
168  %mul.3 = mul i32 %sext.a.3, %sext.b.2
  ; Reduction: two pairwise sums, combined, then added to the accumulator.
169  %add = add i32 %mul.0, %mul.1
170  %add.1 = add i32 %mul.2, %mul.3
171  %add.2 = add i32 %add.1, %add
172  %res = add i32 %add.2, %acc
173  ret i32 %res
174}
175
176; CHECK-LABEL: overlap_4
177; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
178; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
179; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
180; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
181; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
182; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc)
183; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
184; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
185; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
186; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
187; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
188; CHECK: [[RES:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 [[SMLAD]])
189; CHECK: ret i32 [[RES]]
; overlap_4: i32 sum of four products over a[0..3] and b[0..2] where the second
; pair of products is crossed relative to memory order:
;   mul.0 = a[0]*b[0], mul.1 = a[1]*b[1], mul.2 = b[2]*a[2], mul.3 = b[1]*a[3]
; i.e. a[2] is multiplied by b[2] and a[3] by b[1], and the sign-extended b[1]
; is shared with %mul.1.
; The FileCheck patterns preceding this function expect @llvm.arm.smlad on i32
; loads through %a and %b seeded with %acc, followed by the exchanging form
; @llvm.arm.smladx on i32 loads through %a+2 and %b+1.
190define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
191entry:
  ; Addresses of element 1 in each array.
192  %addr.a.1 = getelementptr i16, i16* %a, i32 1
193  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  ; Narrow loads and sign extensions for elements 0 and 1 (interleaved order).
194  %ld.a.0 = load i16, i16* %a
195  %sext.a.0 = sext i16 %ld.a.0 to i32
196  %ld.b.0 = load i16, i16* %b
197  %ld.a.1 = load i16, i16* %addr.a.1
198  %ld.b.1 = load i16, i16* %addr.b.1
199  %sext.a.1 = sext i16 %ld.a.1 to i32
200  %sext.b.1 = sext i16 %ld.b.1 to i32
201  %sext.b.0 = sext i16 %ld.b.0 to i32
  ; First pair of products: a[0]*b[0] and a[1]*b[1].
202  %mul.0 = mul i32 %sext.a.0, %sext.b.0
203  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  ; Addresses, loads and sign extensions for a[2], b[2] and a[3].
204  %addr.a.2 = getelementptr i16, i16* %a, i32 2
205  %addr.b.2 = getelementptr i16, i16* %b, i32 2
206  %addr.a.3 = getelementptr i16, i16* %a, i32 3
207  %ld.a.2 = load i16, i16* %addr.a.2
208  %ld.b.2 = load i16, i16* %addr.b.2
209  %ld.a.3 = load i16, i16* %addr.a.3
210  %sext.a.2 = sext i16 %ld.a.2 to i32
211  %sext.b.2 = sext i16 %ld.b.2 to i32
212  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Crossed second pair: b[2]*a[2] and b[1]*a[3] (b operand written first).
213  %mul.2 = mul i32 %sext.b.2, %sext.a.2
214  %mul.3 = mul i32 %sext.b.1, %sext.a.3
  ; Reduction: two pairwise sums, combined, then added to the accumulator.
215  %add = add i32 %mul.0, %mul.1
216  %add.1 = add i32 %mul.2, %mul.3
217  %add.2 = add i32 %add.1, %add
218  %res = add i32 %add.2, %acc
219  ret i32 %res
220}
221