1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
convdw5x5s1_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void convdw5x5s1_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 #if __aarch64__
18 const int w = bottom_blob.w;
19 #endif
20
21 const int outw = top_blob.w;
22 const int outh = top_blob.h;
23
24 const int group = bottom_blob.c;
25
26 const float* bias = _bias;
27
28 #pragma omp parallel for num_threads(opt.num_threads)
29 for (int g = 0; g < group; g++)
30 {
31 Mat out = top_blob.channel(g);
32
33 float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);
34
35 const float* k0 = kernel.row(g);
36
37 float* outptr0 = out.row(0);
38
39 const Mat img0 = bottom_blob.channel(g);
40
41 const float* r0 = img0.row(0);
42 const float* r1 = img0.row(1);
43 const float* r2 = img0.row(2);
44 const float* r3 = img0.row(3);
45 const float* r4 = img0.row(4);
46
47 int i = 0;
48
49 #if __aarch64__
50 float* outptr1 = out.row(1);
51 const float* r5 = img0.row(5);
52
53 for (; i + 1 < outh; i += 2)
54 {
55 int j = 0;
56
57 for (; j + 3 < outw; j += 4)
58 {
59 float32x4_t _sum00 = _bias0;
60 float32x4_t _sum01 = _bias0;
61 float32x4_t _sum02 = _bias0;
62 float32x4_t _sum03 = _bias0;
63 float32x4_t _sum10 = _bias0;
64 float32x4_t _sum11 = _bias0;
65 float32x4_t _sum12 = _bias0;
66 float32x4_t _sum13 = _bias0;
67
68 float32x4_t _r00 = vld1q_f32(r0);
69 float32x4_t _r01 = vld1q_f32(r0 + 4);
70 float32x4_t _r02 = vld1q_f32(r0 + 8);
71 float32x4_t _r03 = vld1q_f32(r0 + 12);
72 float32x4_t _r04 = vld1q_f32(r0 + 16);
73 float32x4_t _r05 = vld1q_f32(r0 + 20);
74 float32x4_t _r06 = vld1q_f32(r0 + 24);
75 float32x4_t _r07 = vld1q_f32(r0 + 28);
76
77 float32x4_t _k00 = vld1q_f32(k0);
78 float32x4_t _k01 = vld1q_f32(k0 + 4);
79 float32x4_t _k02 = vld1q_f32(k0 + 8);
80 float32x4_t _k03 = vld1q_f32(k0 + 12);
81 float32x4_t _k04 = vld1q_f32(k0 + 16);
82 k0 += 20;
83
84 _sum00 = vmlaq_f32(_sum00, _k00, _r00);
85 _sum00 = vmlaq_f32(_sum00, _k01, _r01);
86 _sum00 = vmlaq_f32(_sum00, _k02, _r02);
87 _sum00 = vmlaq_f32(_sum00, _k03, _r03);
88 _sum00 = vmlaq_f32(_sum00, _k04, _r04);
89 _sum01 = vmlaq_f32(_sum01, _k00, _r01);
90 _sum01 = vmlaq_f32(_sum01, _k01, _r02);
91 _sum01 = vmlaq_f32(_sum01, _k02, _r03);
92 _sum01 = vmlaq_f32(_sum01, _k03, _r04);
93 _sum01 = vmlaq_f32(_sum01, _k04, _r05);
94 _sum02 = vmlaq_f32(_sum02, _k00, _r02);
95 _sum02 = vmlaq_f32(_sum02, _k01, _r03);
96 _sum02 = vmlaq_f32(_sum02, _k02, _r04);
97 _sum02 = vmlaq_f32(_sum02, _k03, _r05);
98 _sum02 = vmlaq_f32(_sum02, _k04, _r06);
99 _sum03 = vmlaq_f32(_sum03, _k00, _r03);
100 _sum03 = vmlaq_f32(_sum03, _k01, _r04);
101 _sum03 = vmlaq_f32(_sum03, _k02, _r05);
102 _sum03 = vmlaq_f32(_sum03, _k03, _r06);
103 _sum03 = vmlaq_f32(_sum03, _k04, _r07);
104
105 float32x4_t _r10 = vld1q_f32(r1);
106 float32x4_t _r11 = vld1q_f32(r1 + 4);
107 float32x4_t _r12 = vld1q_f32(r1 + 8);
108 float32x4_t _r13 = vld1q_f32(r1 + 12);
109 float32x4_t _r14 = vld1q_f32(r1 + 16);
110 float32x4_t _r15 = vld1q_f32(r1 + 20);
111 float32x4_t _r16 = vld1q_f32(r1 + 24);
112 float32x4_t _r17 = vld1q_f32(r1 + 28);
113
114 float32x4_t _k10 = vld1q_f32(k0);
115 float32x4_t _k11 = vld1q_f32(k0 + 4);
116 float32x4_t _k12 = vld1q_f32(k0 + 8);
117 float32x4_t _k13 = vld1q_f32(k0 + 12);
118 float32x4_t _k14 = vld1q_f32(k0 + 16);
119 k0 += 20;
120
121 _sum10 = vmlaq_f32(_sum10, _k00, _r10);
122 _sum10 = vmlaq_f32(_sum10, _k01, _r11);
123 _sum10 = vmlaq_f32(_sum10, _k02, _r12);
124 _sum10 = vmlaq_f32(_sum10, _k03, _r13);
125 _sum10 = vmlaq_f32(_sum10, _k04, _r14);
126 _sum11 = vmlaq_f32(_sum11, _k00, _r11);
127 _sum11 = vmlaq_f32(_sum11, _k01, _r12);
128 _sum11 = vmlaq_f32(_sum11, _k02, _r13);
129 _sum11 = vmlaq_f32(_sum11, _k03, _r14);
130 _sum11 = vmlaq_f32(_sum11, _k04, _r15);
131 _sum12 = vmlaq_f32(_sum12, _k00, _r12);
132 _sum12 = vmlaq_f32(_sum12, _k01, _r13);
133 _sum12 = vmlaq_f32(_sum12, _k02, _r14);
134 _sum12 = vmlaq_f32(_sum12, _k03, _r15);
135 _sum12 = vmlaq_f32(_sum12, _k04, _r16);
136 _sum13 = vmlaq_f32(_sum13, _k00, _r13);
137 _sum13 = vmlaq_f32(_sum13, _k01, _r14);
138 _sum13 = vmlaq_f32(_sum13, _k02, _r15);
139 _sum13 = vmlaq_f32(_sum13, _k03, _r16);
140 _sum13 = vmlaq_f32(_sum13, _k04, _r17);
141
142 _sum00 = vmlaq_f32(_sum00, _k10, _r10);
143 _sum00 = vmlaq_f32(_sum00, _k11, _r11);
144 _sum00 = vmlaq_f32(_sum00, _k12, _r12);
145 _sum00 = vmlaq_f32(_sum00, _k13, _r13);
146 _sum00 = vmlaq_f32(_sum00, _k14, _r14);
147 _sum01 = vmlaq_f32(_sum01, _k10, _r11);
148 _sum01 = vmlaq_f32(_sum01, _k11, _r12);
149 _sum01 = vmlaq_f32(_sum01, _k12, _r13);
150 _sum01 = vmlaq_f32(_sum01, _k13, _r14);
151 _sum01 = vmlaq_f32(_sum01, _k14, _r15);
152 _sum02 = vmlaq_f32(_sum02, _k10, _r12);
153 _sum02 = vmlaq_f32(_sum02, _k11, _r13);
154 _sum02 = vmlaq_f32(_sum02, _k12, _r14);
155 _sum02 = vmlaq_f32(_sum02, _k13, _r15);
156 _sum02 = vmlaq_f32(_sum02, _k14, _r16);
157 _sum03 = vmlaq_f32(_sum03, _k10, _r13);
158 _sum03 = vmlaq_f32(_sum03, _k11, _r14);
159 _sum03 = vmlaq_f32(_sum03, _k12, _r15);
160 _sum03 = vmlaq_f32(_sum03, _k13, _r16);
161 _sum03 = vmlaq_f32(_sum03, _k14, _r17);
162
163 float32x4_t _r20 = vld1q_f32(r2);
164 float32x4_t _r21 = vld1q_f32(r2 + 4);
165 float32x4_t _r22 = vld1q_f32(r2 + 8);
166 float32x4_t _r23 = vld1q_f32(r2 + 12);
167 float32x4_t _r24 = vld1q_f32(r2 + 16);
168 float32x4_t _r25 = vld1q_f32(r2 + 20);
169 float32x4_t _r26 = vld1q_f32(r2 + 24);
170 float32x4_t _r27 = vld1q_f32(r2 + 28);
171
172 float32x4_t _k20 = vld1q_f32(k0);
173 float32x4_t _k21 = vld1q_f32(k0 + 4);
174 float32x4_t _k22 = vld1q_f32(k0 + 8);
175 float32x4_t _k23 = vld1q_f32(k0 + 12);
176 float32x4_t _k24 = vld1q_f32(k0 + 16);
177 k0 += 20;
178
179 _sum10 = vmlaq_f32(_sum10, _k10, _r20);
180 _sum10 = vmlaq_f32(_sum10, _k11, _r21);
181 _sum10 = vmlaq_f32(_sum10, _k12, _r22);
182 _sum10 = vmlaq_f32(_sum10, _k13, _r23);
183 _sum10 = vmlaq_f32(_sum10, _k14, _r24);
184 _sum11 = vmlaq_f32(_sum11, _k10, _r21);
185 _sum11 = vmlaq_f32(_sum11, _k11, _r22);
186 _sum11 = vmlaq_f32(_sum11, _k12, _r23);
187 _sum11 = vmlaq_f32(_sum11, _k13, _r24);
188 _sum11 = vmlaq_f32(_sum11, _k14, _r25);
189 _sum12 = vmlaq_f32(_sum12, _k10, _r22);
190 _sum12 = vmlaq_f32(_sum12, _k11, _r23);
191 _sum12 = vmlaq_f32(_sum12, _k12, _r24);
192 _sum12 = vmlaq_f32(_sum12, _k13, _r25);
193 _sum12 = vmlaq_f32(_sum12, _k14, _r26);
194 _sum13 = vmlaq_f32(_sum13, _k10, _r23);
195 _sum13 = vmlaq_f32(_sum13, _k11, _r24);
196 _sum13 = vmlaq_f32(_sum13, _k12, _r25);
197 _sum13 = vmlaq_f32(_sum13, _k13, _r26);
198 _sum13 = vmlaq_f32(_sum13, _k14, _r27);
199
200 _sum00 = vmlaq_f32(_sum00, _k20, _r20);
201 _sum00 = vmlaq_f32(_sum00, _k21, _r21);
202 _sum00 = vmlaq_f32(_sum00, _k22, _r22);
203 _sum00 = vmlaq_f32(_sum00, _k23, _r23);
204 _sum00 = vmlaq_f32(_sum00, _k24, _r24);
205 _sum01 = vmlaq_f32(_sum01, _k20, _r21);
206 _sum01 = vmlaq_f32(_sum01, _k21, _r22);
207 _sum01 = vmlaq_f32(_sum01, _k22, _r23);
208 _sum01 = vmlaq_f32(_sum01, _k23, _r24);
209 _sum01 = vmlaq_f32(_sum01, _k24, _r25);
210 _sum02 = vmlaq_f32(_sum02, _k20, _r22);
211 _sum02 = vmlaq_f32(_sum02, _k21, _r23);
212 _sum02 = vmlaq_f32(_sum02, _k22, _r24);
213 _sum02 = vmlaq_f32(_sum02, _k23, _r25);
214 _sum02 = vmlaq_f32(_sum02, _k24, _r26);
215 _sum03 = vmlaq_f32(_sum03, _k20, _r23);
216 _sum03 = vmlaq_f32(_sum03, _k21, _r24);
217 _sum03 = vmlaq_f32(_sum03, _k22, _r25);
218 _sum03 = vmlaq_f32(_sum03, _k23, _r26);
219 _sum03 = vmlaq_f32(_sum03, _k24, _r27);
220
221 float32x4_t _r30 = vld1q_f32(r3);
222 float32x4_t _r31 = vld1q_f32(r3 + 4);
223 float32x4_t _r32 = vld1q_f32(r3 + 8);
224 float32x4_t _r33 = vld1q_f32(r3 + 12);
225 float32x4_t _r34 = vld1q_f32(r3 + 16);
226 float32x4_t _r35 = vld1q_f32(r3 + 20);
227 float32x4_t _r36 = vld1q_f32(r3 + 24);
228 float32x4_t _r37 = vld1q_f32(r3 + 28);
229
230 float32x4_t _k30 = vld1q_f32(k0);
231 float32x4_t _k31 = vld1q_f32(k0 + 4);
232 float32x4_t _k32 = vld1q_f32(k0 + 8);
233 float32x4_t _k33 = vld1q_f32(k0 + 12);
234 float32x4_t _k34 = vld1q_f32(k0 + 16);
235 k0 += 20;
236
237 _sum10 = vmlaq_f32(_sum10, _k20, _r30);
238 _sum10 = vmlaq_f32(_sum10, _k21, _r31);
239 _sum10 = vmlaq_f32(_sum10, _k22, _r32);
240 _sum10 = vmlaq_f32(_sum10, _k23, _r33);
241 _sum10 = vmlaq_f32(_sum10, _k24, _r34);
242 _sum11 = vmlaq_f32(_sum11, _k20, _r31);
243 _sum11 = vmlaq_f32(_sum11, _k21, _r32);
244 _sum11 = vmlaq_f32(_sum11, _k22, _r33);
245 _sum11 = vmlaq_f32(_sum11, _k23, _r34);
246 _sum11 = vmlaq_f32(_sum11, _k24, _r35);
247 _sum12 = vmlaq_f32(_sum12, _k20, _r32);
248 _sum12 = vmlaq_f32(_sum12, _k21, _r33);
249 _sum12 = vmlaq_f32(_sum12, _k22, _r34);
250 _sum12 = vmlaq_f32(_sum12, _k23, _r35);
251 _sum12 = vmlaq_f32(_sum12, _k24, _r36);
252 _sum13 = vmlaq_f32(_sum13, _k20, _r33);
253 _sum13 = vmlaq_f32(_sum13, _k21, _r34);
254 _sum13 = vmlaq_f32(_sum13, _k22, _r35);
255 _sum13 = vmlaq_f32(_sum13, _k23, _r36);
256 _sum13 = vmlaq_f32(_sum13, _k24, _r37);
257
258 _sum00 = vmlaq_f32(_sum00, _k30, _r30);
259 _sum00 = vmlaq_f32(_sum00, _k31, _r31);
260 _sum00 = vmlaq_f32(_sum00, _k32, _r32);
261 _sum00 = vmlaq_f32(_sum00, _k33, _r33);
262 _sum00 = vmlaq_f32(_sum00, _k34, _r34);
263 _sum01 = vmlaq_f32(_sum01, _k30, _r31);
264 _sum01 = vmlaq_f32(_sum01, _k31, _r32);
265 _sum01 = vmlaq_f32(_sum01, _k32, _r33);
266 _sum01 = vmlaq_f32(_sum01, _k33, _r34);
267 _sum01 = vmlaq_f32(_sum01, _k34, _r35);
268 _sum02 = vmlaq_f32(_sum02, _k30, _r32);
269 _sum02 = vmlaq_f32(_sum02, _k31, _r33);
270 _sum02 = vmlaq_f32(_sum02, _k32, _r34);
271 _sum02 = vmlaq_f32(_sum02, _k33, _r35);
272 _sum02 = vmlaq_f32(_sum02, _k34, _r36);
273 _sum03 = vmlaq_f32(_sum03, _k30, _r33);
274 _sum03 = vmlaq_f32(_sum03, _k31, _r34);
275 _sum03 = vmlaq_f32(_sum03, _k32, _r35);
276 _sum03 = vmlaq_f32(_sum03, _k33, _r36);
277 _sum03 = vmlaq_f32(_sum03, _k34, _r37);
278
279 float32x4_t _r40 = vld1q_f32(r4);
280 float32x4_t _r41 = vld1q_f32(r4 + 4);
281 float32x4_t _r42 = vld1q_f32(r4 + 8);
282 float32x4_t _r43 = vld1q_f32(r4 + 12);
283 float32x4_t _r44 = vld1q_f32(r4 + 16);
284 float32x4_t _r45 = vld1q_f32(r4 + 20);
285 float32x4_t _r46 = vld1q_f32(r4 + 24);
286 float32x4_t _r47 = vld1q_f32(r4 + 28);
287
288 float32x4_t _k40 = vld1q_f32(k0);
289 float32x4_t _k41 = vld1q_f32(k0 + 4);
290 float32x4_t _k42 = vld1q_f32(k0 + 8);
291 float32x4_t _k43 = vld1q_f32(k0 + 12);
292 float32x4_t _k44 = vld1q_f32(k0 + 16);
293 k0 -= 80;
294
295 _sum10 = vmlaq_f32(_sum10, _k30, _r40);
296 _sum10 = vmlaq_f32(_sum10, _k31, _r41);
297 _sum10 = vmlaq_f32(_sum10, _k32, _r42);
298 _sum10 = vmlaq_f32(_sum10, _k33, _r43);
299 _sum10 = vmlaq_f32(_sum10, _k34, _r44);
300 _sum11 = vmlaq_f32(_sum11, _k30, _r41);
301 _sum11 = vmlaq_f32(_sum11, _k31, _r42);
302 _sum11 = vmlaq_f32(_sum11, _k32, _r43);
303 _sum11 = vmlaq_f32(_sum11, _k33, _r44);
304 _sum11 = vmlaq_f32(_sum11, _k34, _r45);
305 _sum12 = vmlaq_f32(_sum12, _k30, _r42);
306 _sum12 = vmlaq_f32(_sum12, _k31, _r43);
307 _sum12 = vmlaq_f32(_sum12, _k32, _r44);
308 _sum12 = vmlaq_f32(_sum12, _k33, _r45);
309 _sum12 = vmlaq_f32(_sum12, _k34, _r46);
310 _sum13 = vmlaq_f32(_sum13, _k30, _r43);
311 _sum13 = vmlaq_f32(_sum13, _k31, _r44);
312 _sum13 = vmlaq_f32(_sum13, _k32, _r45);
313 _sum13 = vmlaq_f32(_sum13, _k33, _r46);
314 _sum13 = vmlaq_f32(_sum13, _k34, _r47);
315
316 _sum00 = vmlaq_f32(_sum00, _k40, _r40);
317 _sum00 = vmlaq_f32(_sum00, _k41, _r41);
318 _sum00 = vmlaq_f32(_sum00, _k42, _r42);
319 _sum00 = vmlaq_f32(_sum00, _k43, _r43);
320 _sum00 = vmlaq_f32(_sum00, _k44, _r44);
321 _sum01 = vmlaq_f32(_sum01, _k40, _r41);
322 _sum01 = vmlaq_f32(_sum01, _k41, _r42);
323 _sum01 = vmlaq_f32(_sum01, _k42, _r43);
324 _sum01 = vmlaq_f32(_sum01, _k43, _r44);
325 _sum01 = vmlaq_f32(_sum01, _k44, _r45);
326 _sum02 = vmlaq_f32(_sum02, _k40, _r42);
327 _sum02 = vmlaq_f32(_sum02, _k41, _r43);
328 _sum02 = vmlaq_f32(_sum02, _k42, _r44);
329 _sum02 = vmlaq_f32(_sum02, _k43, _r45);
330 _sum02 = vmlaq_f32(_sum02, _k44, _r46);
331 _sum03 = vmlaq_f32(_sum03, _k40, _r43);
332 _sum03 = vmlaq_f32(_sum03, _k41, _r44);
333 _sum03 = vmlaq_f32(_sum03, _k42, _r45);
334 _sum03 = vmlaq_f32(_sum03, _k43, _r46);
335 _sum03 = vmlaq_f32(_sum03, _k44, _r47);
336
337 float32x4_t _r50 = vld1q_f32(r5);
338 float32x4_t _r51 = vld1q_f32(r5 + 4);
339 float32x4_t _r52 = vld1q_f32(r5 + 8);
340 float32x4_t _r53 = vld1q_f32(r5 + 12);
341 float32x4_t _r54 = vld1q_f32(r5 + 16);
342 float32x4_t _r55 = vld1q_f32(r5 + 20);
343 float32x4_t _r56 = vld1q_f32(r5 + 24);
344 float32x4_t _r57 = vld1q_f32(r5 + 28);
345
346 _sum10 = vmlaq_f32(_sum10, _k40, _r50);
347 _sum10 = vmlaq_f32(_sum10, _k41, _r51);
348 _sum10 = vmlaq_f32(_sum10, _k42, _r52);
349 _sum10 = vmlaq_f32(_sum10, _k43, _r53);
350 _sum10 = vmlaq_f32(_sum10, _k44, _r54);
351 _sum11 = vmlaq_f32(_sum11, _k40, _r51);
352 _sum11 = vmlaq_f32(_sum11, _k41, _r52);
353 _sum11 = vmlaq_f32(_sum11, _k42, _r53);
354 _sum11 = vmlaq_f32(_sum11, _k43, _r54);
355 _sum11 = vmlaq_f32(_sum11, _k44, _r55);
356 _sum12 = vmlaq_f32(_sum12, _k40, _r52);
357 _sum12 = vmlaq_f32(_sum12, _k41, _r53);
358 _sum12 = vmlaq_f32(_sum12, _k42, _r54);
359 _sum12 = vmlaq_f32(_sum12, _k43, _r55);
360 _sum12 = vmlaq_f32(_sum12, _k44, _r56);
361 _sum13 = vmlaq_f32(_sum13, _k40, _r53);
362 _sum13 = vmlaq_f32(_sum13, _k41, _r54);
363 _sum13 = vmlaq_f32(_sum13, _k42, _r55);
364 _sum13 = vmlaq_f32(_sum13, _k43, _r56);
365 _sum13 = vmlaq_f32(_sum13, _k44, _r57);
366
367 vst1q_f32(outptr0, _sum00);
368 vst1q_f32(outptr0 + 4, _sum01);
369 vst1q_f32(outptr0 + 8, _sum02);
370 vst1q_f32(outptr0 + 12, _sum03);
371 vst1q_f32(outptr1, _sum10);
372 vst1q_f32(outptr1 + 4, _sum11);
373 vst1q_f32(outptr1 + 8, _sum12);
374 vst1q_f32(outptr1 + 12, _sum13);
375
376 r0 += 16;
377 r1 += 16;
378 r2 += 16;
379 r3 += 16;
380 r4 += 16;
381 r5 += 16;
382 outptr0 += 16;
383 outptr1 += 16;
384 }
385 for (; j + 1 < outw; j += 2)
386 {
387 float32x4_t _sum00 = _bias0;
388 float32x4_t _sum01 = _bias0;
389 float32x4_t _sum10 = _bias0;
390 float32x4_t _sum11 = _bias0;
391
392 float32x4_t _r00 = vld1q_f32(r0);
393 float32x4_t _r01 = vld1q_f32(r0 + 4);
394 float32x4_t _r02 = vld1q_f32(r0 + 8);
395 float32x4_t _r03 = vld1q_f32(r0 + 12);
396 float32x4_t _r04 = vld1q_f32(r0 + 16);
397 float32x4_t _r05 = vld1q_f32(r0 + 20);
398
399 float32x4_t _k00 = vld1q_f32(k0);
400 float32x4_t _k01 = vld1q_f32(k0 + 4);
401 float32x4_t _k02 = vld1q_f32(k0 + 8);
402 float32x4_t _k03 = vld1q_f32(k0 + 12);
403 float32x4_t _k04 = vld1q_f32(k0 + 16);
404 k0 += 20;
405
406 _sum00 = vmlaq_f32(_sum00, _k00, _r00);
407 _sum00 = vmlaq_f32(_sum00, _k01, _r01);
408 _sum00 = vmlaq_f32(_sum00, _k02, _r02);
409 _sum00 = vmlaq_f32(_sum00, _k03, _r03);
410 _sum00 = vmlaq_f32(_sum00, _k04, _r04);
411 _sum01 = vmlaq_f32(_sum01, _k00, _r01);
412 _sum01 = vmlaq_f32(_sum01, _k01, _r02);
413 _sum01 = vmlaq_f32(_sum01, _k02, _r03);
414 _sum01 = vmlaq_f32(_sum01, _k03, _r04);
415 _sum01 = vmlaq_f32(_sum01, _k04, _r05);
416
417 float32x4_t _r10 = vld1q_f32(r1);
418 float32x4_t _r11 = vld1q_f32(r1 + 4);
419 float32x4_t _r12 = vld1q_f32(r1 + 8);
420 float32x4_t _r13 = vld1q_f32(r1 + 12);
421 float32x4_t _r14 = vld1q_f32(r1 + 16);
422 float32x4_t _r15 = vld1q_f32(r1 + 20);
423
424 float32x4_t _k10 = vld1q_f32(k0);
425 float32x4_t _k11 = vld1q_f32(k0 + 4);
426 float32x4_t _k12 = vld1q_f32(k0 + 8);
427 float32x4_t _k13 = vld1q_f32(k0 + 12);
428 float32x4_t _k14 = vld1q_f32(k0 + 16);
429 k0 += 20;
430
431 _sum10 = vmlaq_f32(_sum10, _k00, _r10);
432 _sum10 = vmlaq_f32(_sum10, _k01, _r11);
433 _sum10 = vmlaq_f32(_sum10, _k02, _r12);
434 _sum10 = vmlaq_f32(_sum10, _k03, _r13);
435 _sum10 = vmlaq_f32(_sum10, _k04, _r14);
436 _sum11 = vmlaq_f32(_sum11, _k00, _r11);
437 _sum11 = vmlaq_f32(_sum11, _k01, _r12);
438 _sum11 = vmlaq_f32(_sum11, _k02, _r13);
439 _sum11 = vmlaq_f32(_sum11, _k03, _r14);
440 _sum11 = vmlaq_f32(_sum11, _k04, _r15);
441
442 _sum00 = vmlaq_f32(_sum00, _k10, _r10);
443 _sum00 = vmlaq_f32(_sum00, _k11, _r11);
444 _sum00 = vmlaq_f32(_sum00, _k12, _r12);
445 _sum00 = vmlaq_f32(_sum00, _k13, _r13);
446 _sum00 = vmlaq_f32(_sum00, _k14, _r14);
447 _sum01 = vmlaq_f32(_sum01, _k10, _r11);
448 _sum01 = vmlaq_f32(_sum01, _k11, _r12);
449 _sum01 = vmlaq_f32(_sum01, _k12, _r13);
450 _sum01 = vmlaq_f32(_sum01, _k13, _r14);
451 _sum01 = vmlaq_f32(_sum01, _k14, _r15);
452
453 float32x4_t _r20 = vld1q_f32(r2);
454 float32x4_t _r21 = vld1q_f32(r2 + 4);
455 float32x4_t _r22 = vld1q_f32(r2 + 8);
456 float32x4_t _r23 = vld1q_f32(r2 + 12);
457 float32x4_t _r24 = vld1q_f32(r2 + 16);
458 float32x4_t _r25 = vld1q_f32(r2 + 20);
459
460 float32x4_t _k20 = vld1q_f32(k0);
461 float32x4_t _k21 = vld1q_f32(k0 + 4);
462 float32x4_t _k22 = vld1q_f32(k0 + 8);
463 float32x4_t _k23 = vld1q_f32(k0 + 12);
464 float32x4_t _k24 = vld1q_f32(k0 + 16);
465 k0 += 20;
466
467 _sum10 = vmlaq_f32(_sum10, _k10, _r20);
468 _sum10 = vmlaq_f32(_sum10, _k11, _r21);
469 _sum10 = vmlaq_f32(_sum10, _k12, _r22);
470 _sum10 = vmlaq_f32(_sum10, _k13, _r23);
471 _sum10 = vmlaq_f32(_sum10, _k14, _r24);
472 _sum11 = vmlaq_f32(_sum11, _k10, _r21);
473 _sum11 = vmlaq_f32(_sum11, _k11, _r22);
474 _sum11 = vmlaq_f32(_sum11, _k12, _r23);
475 _sum11 = vmlaq_f32(_sum11, _k13, _r24);
476 _sum11 = vmlaq_f32(_sum11, _k14, _r25);
477
478 _sum00 = vmlaq_f32(_sum00, _k20, _r20);
479 _sum00 = vmlaq_f32(_sum00, _k21, _r21);
480 _sum00 = vmlaq_f32(_sum00, _k22, _r22);
481 _sum00 = vmlaq_f32(_sum00, _k23, _r23);
482 _sum00 = vmlaq_f32(_sum00, _k24, _r24);
483 _sum01 = vmlaq_f32(_sum01, _k20, _r21);
484 _sum01 = vmlaq_f32(_sum01, _k21, _r22);
485 _sum01 = vmlaq_f32(_sum01, _k22, _r23);
486 _sum01 = vmlaq_f32(_sum01, _k23, _r24);
487 _sum01 = vmlaq_f32(_sum01, _k24, _r25);
488
489 float32x4_t _r30 = vld1q_f32(r3);
490 float32x4_t _r31 = vld1q_f32(r3 + 4);
491 float32x4_t _r32 = vld1q_f32(r3 + 8);
492 float32x4_t _r33 = vld1q_f32(r3 + 12);
493 float32x4_t _r34 = vld1q_f32(r3 + 16);
494 float32x4_t _r35 = vld1q_f32(r3 + 20);
495
496 float32x4_t _k30 = vld1q_f32(k0);
497 float32x4_t _k31 = vld1q_f32(k0 + 4);
498 float32x4_t _k32 = vld1q_f32(k0 + 8);
499 float32x4_t _k33 = vld1q_f32(k0 + 12);
500 float32x4_t _k34 = vld1q_f32(k0 + 16);
501 k0 += 20;
502
503 _sum10 = vmlaq_f32(_sum10, _k20, _r30);
504 _sum10 = vmlaq_f32(_sum10, _k21, _r31);
505 _sum10 = vmlaq_f32(_sum10, _k22, _r32);
506 _sum10 = vmlaq_f32(_sum10, _k23, _r33);
507 _sum10 = vmlaq_f32(_sum10, _k24, _r34);
508 _sum11 = vmlaq_f32(_sum11, _k20, _r31);
509 _sum11 = vmlaq_f32(_sum11, _k21, _r32);
510 _sum11 = vmlaq_f32(_sum11, _k22, _r33);
511 _sum11 = vmlaq_f32(_sum11, _k23, _r34);
512 _sum11 = vmlaq_f32(_sum11, _k24, _r35);
513
514 _sum00 = vmlaq_f32(_sum00, _k30, _r30);
515 _sum00 = vmlaq_f32(_sum00, _k31, _r31);
516 _sum00 = vmlaq_f32(_sum00, _k32, _r32);
517 _sum00 = vmlaq_f32(_sum00, _k33, _r33);
518 _sum00 = vmlaq_f32(_sum00, _k34, _r34);
519 _sum01 = vmlaq_f32(_sum01, _k30, _r31);
520 _sum01 = vmlaq_f32(_sum01, _k31, _r32);
521 _sum01 = vmlaq_f32(_sum01, _k32, _r33);
522 _sum01 = vmlaq_f32(_sum01, _k33, _r34);
523 _sum01 = vmlaq_f32(_sum01, _k34, _r35);
524
525 float32x4_t _r40 = vld1q_f32(r4);
526 float32x4_t _r41 = vld1q_f32(r4 + 4);
527 float32x4_t _r42 = vld1q_f32(r4 + 8);
528 float32x4_t _r43 = vld1q_f32(r4 + 12);
529 float32x4_t _r44 = vld1q_f32(r4 + 16);
530 float32x4_t _r45 = vld1q_f32(r4 + 20);
531
532 float32x4_t _k40 = vld1q_f32(k0);
533 float32x4_t _k41 = vld1q_f32(k0 + 4);
534 float32x4_t _k42 = vld1q_f32(k0 + 8);
535 float32x4_t _k43 = vld1q_f32(k0 + 12);
536 float32x4_t _k44 = vld1q_f32(k0 + 16);
537 k0 -= 80;
538
539 _sum10 = vmlaq_f32(_sum10, _k30, _r40);
540 _sum10 = vmlaq_f32(_sum10, _k31, _r41);
541 _sum10 = vmlaq_f32(_sum10, _k32, _r42);
542 _sum10 = vmlaq_f32(_sum10, _k33, _r43);
543 _sum10 = vmlaq_f32(_sum10, _k34, _r44);
544 _sum11 = vmlaq_f32(_sum11, _k30, _r41);
545 _sum11 = vmlaq_f32(_sum11, _k31, _r42);
546 _sum11 = vmlaq_f32(_sum11, _k32, _r43);
547 _sum11 = vmlaq_f32(_sum11, _k33, _r44);
548 _sum11 = vmlaq_f32(_sum11, _k34, _r45);
549
550 _sum00 = vmlaq_f32(_sum00, _k40, _r40);
551 _sum00 = vmlaq_f32(_sum00, _k41, _r41);
552 _sum00 = vmlaq_f32(_sum00, _k42, _r42);
553 _sum00 = vmlaq_f32(_sum00, _k43, _r43);
554 _sum00 = vmlaq_f32(_sum00, _k44, _r44);
555 _sum01 = vmlaq_f32(_sum01, _k40, _r41);
556 _sum01 = vmlaq_f32(_sum01, _k41, _r42);
557 _sum01 = vmlaq_f32(_sum01, _k42, _r43);
558 _sum01 = vmlaq_f32(_sum01, _k43, _r44);
559 _sum01 = vmlaq_f32(_sum01, _k44, _r45);
560
561 float32x4_t _r50 = vld1q_f32(r5);
562 float32x4_t _r51 = vld1q_f32(r5 + 4);
563 float32x4_t _r52 = vld1q_f32(r5 + 8);
564 float32x4_t _r53 = vld1q_f32(r5 + 12);
565 float32x4_t _r54 = vld1q_f32(r5 + 16);
566 float32x4_t _r55 = vld1q_f32(r5 + 20);
567
568 _sum10 = vmlaq_f32(_sum10, _k40, _r50);
569 _sum10 = vmlaq_f32(_sum10, _k41, _r51);
570 _sum10 = vmlaq_f32(_sum10, _k42, _r52);
571 _sum10 = vmlaq_f32(_sum10, _k43, _r53);
572 _sum10 = vmlaq_f32(_sum10, _k44, _r54);
573 _sum11 = vmlaq_f32(_sum11, _k40, _r51);
574 _sum11 = vmlaq_f32(_sum11, _k41, _r52);
575 _sum11 = vmlaq_f32(_sum11, _k42, _r53);
576 _sum11 = vmlaq_f32(_sum11, _k43, _r54);
577 _sum11 = vmlaq_f32(_sum11, _k44, _r55);
578
579 vst1q_f32(outptr0, _sum00);
580 vst1q_f32(outptr0 + 4, _sum01);
581 vst1q_f32(outptr1, _sum10);
582 vst1q_f32(outptr1 + 4, _sum11);
583
584 r0 += 8;
585 r1 += 8;
586 r2 += 8;
587 r3 += 8;
588 r4 += 8;
589 r5 += 8;
590 outptr0 += 8;
591 outptr1 += 8;
592 }
593 for (; j < outw; j++)
594 {
595 float32x4_t _sum0 = _bias0;
596 float32x4_t _sum1 = _bias0;
597
598 float32x4_t _r00 = vld1q_f32(r0);
599 float32x4_t _r01 = vld1q_f32(r0 + 4);
600 float32x4_t _r02 = vld1q_f32(r0 + 8);
601 float32x4_t _r03 = vld1q_f32(r0 + 12);
602 float32x4_t _r04 = vld1q_f32(r0 + 16);
603
604 float32x4_t _k00 = vld1q_f32(k0);
605 float32x4_t _k01 = vld1q_f32(k0 + 4);
606 float32x4_t _k02 = vld1q_f32(k0 + 8);
607 float32x4_t _k03 = vld1q_f32(k0 + 12);
608 float32x4_t _k04 = vld1q_f32(k0 + 16);
609 k0 += 20;
610
611 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
612 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
613 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
614 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
615 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
616
617 float32x4_t _r10 = vld1q_f32(r1);
618 float32x4_t _r11 = vld1q_f32(r1 + 4);
619 float32x4_t _r12 = vld1q_f32(r1 + 8);
620 float32x4_t _r13 = vld1q_f32(r1 + 12);
621 float32x4_t _r14 = vld1q_f32(r1 + 16);
622
623 float32x4_t _k10 = vld1q_f32(k0);
624 float32x4_t _k11 = vld1q_f32(k0 + 4);
625 float32x4_t _k12 = vld1q_f32(k0 + 8);
626 float32x4_t _k13 = vld1q_f32(k0 + 12);
627 float32x4_t _k14 = vld1q_f32(k0 + 16);
628 k0 += 20;
629
630 _sum1 = vmlaq_f32(_sum1, _k00, _r10);
631 _sum1 = vmlaq_f32(_sum1, _k01, _r11);
632 _sum1 = vmlaq_f32(_sum1, _k02, _r12);
633 _sum1 = vmlaq_f32(_sum1, _k03, _r13);
634 _sum1 = vmlaq_f32(_sum1, _k04, _r14);
635
636 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
637 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
638 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
639 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
640 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
641
642 float32x4_t _r20 = vld1q_f32(r2);
643 float32x4_t _r21 = vld1q_f32(r2 + 4);
644 float32x4_t _r22 = vld1q_f32(r2 + 8);
645 float32x4_t _r23 = vld1q_f32(r2 + 12);
646 float32x4_t _r24 = vld1q_f32(r2 + 16);
647
648 float32x4_t _k20 = vld1q_f32(k0);
649 float32x4_t _k21 = vld1q_f32(k0 + 4);
650 float32x4_t _k22 = vld1q_f32(k0 + 8);
651 float32x4_t _k23 = vld1q_f32(k0 + 12);
652 float32x4_t _k24 = vld1q_f32(k0 + 16);
653 k0 += 20;
654
655 _sum1 = vmlaq_f32(_sum1, _k10, _r20);
656 _sum1 = vmlaq_f32(_sum1, _k11, _r21);
657 _sum1 = vmlaq_f32(_sum1, _k12, _r22);
658 _sum1 = vmlaq_f32(_sum1, _k13, _r23);
659 _sum1 = vmlaq_f32(_sum1, _k14, _r24);
660
661 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
662 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
663 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
664 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
665 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
666
667 float32x4_t _r30 = vld1q_f32(r3);
668 float32x4_t _r31 = vld1q_f32(r3 + 4);
669 float32x4_t _r32 = vld1q_f32(r3 + 8);
670 float32x4_t _r33 = vld1q_f32(r3 + 12);
671 float32x4_t _r34 = vld1q_f32(r3 + 16);
672
673 float32x4_t _k30 = vld1q_f32(k0);
674 float32x4_t _k31 = vld1q_f32(k0 + 4);
675 float32x4_t _k32 = vld1q_f32(k0 + 8);
676 float32x4_t _k33 = vld1q_f32(k0 + 12);
677 float32x4_t _k34 = vld1q_f32(k0 + 16);
678 k0 += 20;
679
680 _sum1 = vmlaq_f32(_sum1, _k20, _r30);
681 _sum1 = vmlaq_f32(_sum1, _k21, _r31);
682 _sum1 = vmlaq_f32(_sum1, _k22, _r32);
683 _sum1 = vmlaq_f32(_sum1, _k23, _r33);
684 _sum1 = vmlaq_f32(_sum1, _k24, _r34);
685
686 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
687 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
688 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
689 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
690 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
691
692 float32x4_t _r40 = vld1q_f32(r4);
693 float32x4_t _r41 = vld1q_f32(r4 + 4);
694 float32x4_t _r42 = vld1q_f32(r4 + 8);
695 float32x4_t _r43 = vld1q_f32(r4 + 12);
696 float32x4_t _r44 = vld1q_f32(r4 + 16);
697
698 float32x4_t _k40 = vld1q_f32(k0);
699 float32x4_t _k41 = vld1q_f32(k0 + 4);
700 float32x4_t _k42 = vld1q_f32(k0 + 8);
701 float32x4_t _k43 = vld1q_f32(k0 + 12);
702 float32x4_t _k44 = vld1q_f32(k0 + 16);
703 k0 -= 80;
704
705 _sum1 = vmlaq_f32(_sum1, _k30, _r40);
706 _sum1 = vmlaq_f32(_sum1, _k31, _r41);
707 _sum1 = vmlaq_f32(_sum1, _k32, _r42);
708 _sum1 = vmlaq_f32(_sum1, _k33, _r43);
709 _sum1 = vmlaq_f32(_sum1, _k34, _r44);
710
711 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
712 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
713 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
714 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
715 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
716
717 float32x4_t _r50 = vld1q_f32(r5);
718 float32x4_t _r51 = vld1q_f32(r5 + 4);
719 float32x4_t _r52 = vld1q_f32(r5 + 8);
720 float32x4_t _r53 = vld1q_f32(r5 + 12);
721 float32x4_t _r54 = vld1q_f32(r5 + 16);
722
723 _sum1 = vmlaq_f32(_sum1, _k40, _r50);
724 _sum1 = vmlaq_f32(_sum1, _k41, _r51);
725 _sum1 = vmlaq_f32(_sum1, _k42, _r52);
726 _sum1 = vmlaq_f32(_sum1, _k43, _r53);
727 _sum1 = vmlaq_f32(_sum1, _k44, _r54);
728
729 vst1q_f32(outptr0, _sum0);
730 vst1q_f32(outptr1, _sum1);
731
732 r0 += 4;
733 r1 += 4;
734 r2 += 4;
735 r3 += 4;
736 r4 += 4;
737 r5 += 4;
738 outptr0 += 4;
739 outptr1 += 4;
740 }
741
742 r0 += 4 * 4 + w * 4;
743 r1 += 4 * 4 + w * 4;
744 r2 += 4 * 4 + w * 4;
745 r3 += 4 * 4 + w * 4;
746 r4 += 4 * 4 + w * 4;
747 r5 += 4 * 4 + w * 4;
748
749 outptr0 += outw * 4;
750 outptr1 += outw * 4;
751 }
752 #endif // __aarch64__
753 for (; i < outh; i++)
754 {
755 int j = 0;
756
757 for (; j + 3 < outw; j += 4)
758 {
759 float32x4_t _sum0 = _bias0;
760 float32x4_t _sum1 = _bias0;
761 float32x4_t _sum2 = _bias0;
762 float32x4_t _sum3 = _bias0;
763
764 float32x4_t _r00 = vld1q_f32(r0);
765 float32x4_t _r01 = vld1q_f32(r0 + 4);
766 float32x4_t _r02 = vld1q_f32(r0 + 8);
767 float32x4_t _r03 = vld1q_f32(r0 + 12);
768 float32x4_t _r04 = vld1q_f32(r0 + 16);
769 float32x4_t _r05 = vld1q_f32(r0 + 20);
770 float32x4_t _r06 = vld1q_f32(r0 + 24);
771 float32x4_t _r07 = vld1q_f32(r0 + 28);
772
773 float32x4_t _k00 = vld1q_f32(k0);
774 float32x4_t _k01 = vld1q_f32(k0 + 4);
775 float32x4_t _k02 = vld1q_f32(k0 + 8);
776 float32x4_t _k03 = vld1q_f32(k0 + 12);
777 float32x4_t _k04 = vld1q_f32(k0 + 16);
778 k0 += 20;
779
780 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
781 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
782 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
783 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
784 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
785 _sum1 = vmlaq_f32(_sum1, _k00, _r01);
786 _sum1 = vmlaq_f32(_sum1, _k01, _r02);
787 _sum1 = vmlaq_f32(_sum1, _k02, _r03);
788 _sum1 = vmlaq_f32(_sum1, _k03, _r04);
789 _sum1 = vmlaq_f32(_sum1, _k04, _r05);
790 _sum2 = vmlaq_f32(_sum2, _k00, _r02);
791 _sum2 = vmlaq_f32(_sum2, _k01, _r03);
792 _sum2 = vmlaq_f32(_sum2, _k02, _r04);
793 _sum2 = vmlaq_f32(_sum2, _k03, _r05);
794 _sum2 = vmlaq_f32(_sum2, _k04, _r06);
795 _sum3 = vmlaq_f32(_sum3, _k00, _r03);
796 _sum3 = vmlaq_f32(_sum3, _k01, _r04);
797 _sum3 = vmlaq_f32(_sum3, _k02, _r05);
798 _sum3 = vmlaq_f32(_sum3, _k03, _r06);
799 _sum3 = vmlaq_f32(_sum3, _k04, _r07);
800
801 float32x4_t _r10 = vld1q_f32(r1);
802 float32x4_t _r11 = vld1q_f32(r1 + 4);
803 float32x4_t _r12 = vld1q_f32(r1 + 8);
804 float32x4_t _r13 = vld1q_f32(r1 + 12);
805 float32x4_t _r14 = vld1q_f32(r1 + 16);
806 float32x4_t _r15 = vld1q_f32(r1 + 20);
807 float32x4_t _r16 = vld1q_f32(r1 + 24);
808 float32x4_t _r17 = vld1q_f32(r1 + 28);
809
810 float32x4_t _k10 = vld1q_f32(k0);
811 float32x4_t _k11 = vld1q_f32(k0 + 4);
812 float32x4_t _k12 = vld1q_f32(k0 + 8);
813 float32x4_t _k13 = vld1q_f32(k0 + 12);
814 float32x4_t _k14 = vld1q_f32(k0 + 16);
815 k0 += 20;
816
817 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
818 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
819 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
820 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
821 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
822 _sum1 = vmlaq_f32(_sum1, _k10, _r11);
823 _sum1 = vmlaq_f32(_sum1, _k11, _r12);
824 _sum1 = vmlaq_f32(_sum1, _k12, _r13);
825 _sum1 = vmlaq_f32(_sum1, _k13, _r14);
826 _sum1 = vmlaq_f32(_sum1, _k14, _r15);
827 _sum2 = vmlaq_f32(_sum2, _k10, _r12);
828 _sum2 = vmlaq_f32(_sum2, _k11, _r13);
829 _sum2 = vmlaq_f32(_sum2, _k12, _r14);
830 _sum2 = vmlaq_f32(_sum2, _k13, _r15);
831 _sum2 = vmlaq_f32(_sum2, _k14, _r16);
832 _sum3 = vmlaq_f32(_sum3, _k10, _r13);
833 _sum3 = vmlaq_f32(_sum3, _k11, _r14);
834 _sum3 = vmlaq_f32(_sum3, _k12, _r15);
835 _sum3 = vmlaq_f32(_sum3, _k13, _r16);
836 _sum3 = vmlaq_f32(_sum3, _k14, _r17);
837
838 float32x4_t _r20 = vld1q_f32(r2);
839 float32x4_t _r21 = vld1q_f32(r2 + 4);
840 float32x4_t _r22 = vld1q_f32(r2 + 8);
841 float32x4_t _r23 = vld1q_f32(r2 + 12);
842 float32x4_t _r24 = vld1q_f32(r2 + 16);
843 float32x4_t _r25 = vld1q_f32(r2 + 20);
844 float32x4_t _r26 = vld1q_f32(r2 + 24);
845 float32x4_t _r27 = vld1q_f32(r2 + 28);
846
847 float32x4_t _k20 = vld1q_f32(k0);
848 float32x4_t _k21 = vld1q_f32(k0 + 4);
849 float32x4_t _k22 = vld1q_f32(k0 + 8);
850 float32x4_t _k23 = vld1q_f32(k0 + 12);
851 float32x4_t _k24 = vld1q_f32(k0 + 16);
852 k0 += 20;
853
854 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
855 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
856 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
857 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
858 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
859 _sum1 = vmlaq_f32(_sum1, _k20, _r21);
860 _sum1 = vmlaq_f32(_sum1, _k21, _r22);
861 _sum1 = vmlaq_f32(_sum1, _k22, _r23);
862 _sum1 = vmlaq_f32(_sum1, _k23, _r24);
863 _sum1 = vmlaq_f32(_sum1, _k24, _r25);
864 _sum2 = vmlaq_f32(_sum2, _k20, _r22);
865 _sum2 = vmlaq_f32(_sum2, _k21, _r23);
866 _sum2 = vmlaq_f32(_sum2, _k22, _r24);
867 _sum2 = vmlaq_f32(_sum2, _k23, _r25);
868 _sum2 = vmlaq_f32(_sum2, _k24, _r26);
869 _sum3 = vmlaq_f32(_sum3, _k20, _r23);
870 _sum3 = vmlaq_f32(_sum3, _k21, _r24);
871 _sum3 = vmlaq_f32(_sum3, _k22, _r25);
872 _sum3 = vmlaq_f32(_sum3, _k23, _r26);
873 _sum3 = vmlaq_f32(_sum3, _k24, _r27);
874
875 float32x4_t _r30 = vld1q_f32(r3);
876 float32x4_t _r31 = vld1q_f32(r3 + 4);
877 float32x4_t _r32 = vld1q_f32(r3 + 8);
878 float32x4_t _r33 = vld1q_f32(r3 + 12);
879 float32x4_t _r34 = vld1q_f32(r3 + 16);
880 float32x4_t _r35 = vld1q_f32(r3 + 20);
881 float32x4_t _r36 = vld1q_f32(r3 + 24);
882 float32x4_t _r37 = vld1q_f32(r3 + 28);
883
884 float32x4_t _k30 = vld1q_f32(k0);
885 float32x4_t _k31 = vld1q_f32(k0 + 4);
886 float32x4_t _k32 = vld1q_f32(k0 + 8);
887 float32x4_t _k33 = vld1q_f32(k0 + 12);
888 float32x4_t _k34 = vld1q_f32(k0 + 16);
889 k0 += 20;
890
891 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
892 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
893 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
894 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
895 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
896 _sum1 = vmlaq_f32(_sum1, _k30, _r31);
897 _sum1 = vmlaq_f32(_sum1, _k31, _r32);
898 _sum1 = vmlaq_f32(_sum1, _k32, _r33);
899 _sum1 = vmlaq_f32(_sum1, _k33, _r34);
900 _sum1 = vmlaq_f32(_sum1, _k34, _r35);
901 _sum2 = vmlaq_f32(_sum2, _k30, _r32);
902 _sum2 = vmlaq_f32(_sum2, _k31, _r33);
903 _sum2 = vmlaq_f32(_sum2, _k32, _r34);
904 _sum2 = vmlaq_f32(_sum2, _k33, _r35);
905 _sum2 = vmlaq_f32(_sum2, _k34, _r36);
906 _sum3 = vmlaq_f32(_sum3, _k30, _r33);
907 _sum3 = vmlaq_f32(_sum3, _k31, _r34);
908 _sum3 = vmlaq_f32(_sum3, _k32, _r35);
909 _sum3 = vmlaq_f32(_sum3, _k33, _r36);
910 _sum3 = vmlaq_f32(_sum3, _k34, _r37);
911
912 float32x4_t _r40 = vld1q_f32(r4);
913 float32x4_t _r41 = vld1q_f32(r4 + 4);
914 float32x4_t _r42 = vld1q_f32(r4 + 8);
915 float32x4_t _r43 = vld1q_f32(r4 + 12);
916 float32x4_t _r44 = vld1q_f32(r4 + 16);
917 float32x4_t _r45 = vld1q_f32(r4 + 20);
918 float32x4_t _r46 = vld1q_f32(r4 + 24);
919 float32x4_t _r47 = vld1q_f32(r4 + 28);
920
921 float32x4_t _k40 = vld1q_f32(k0);
922 float32x4_t _k41 = vld1q_f32(k0 + 4);
923 float32x4_t _k42 = vld1q_f32(k0 + 8);
924 float32x4_t _k43 = vld1q_f32(k0 + 12);
925 float32x4_t _k44 = vld1q_f32(k0 + 16);
926 k0 -= 80;
927
928 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
929 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
930 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
931 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
932 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
933 _sum1 = vmlaq_f32(_sum1, _k40, _r41);
934 _sum1 = vmlaq_f32(_sum1, _k41, _r42);
935 _sum1 = vmlaq_f32(_sum1, _k42, _r43);
936 _sum1 = vmlaq_f32(_sum1, _k43, _r44);
937 _sum1 = vmlaq_f32(_sum1, _k44, _r45);
938 _sum2 = vmlaq_f32(_sum2, _k40, _r42);
939 _sum2 = vmlaq_f32(_sum2, _k41, _r43);
940 _sum2 = vmlaq_f32(_sum2, _k42, _r44);
941 _sum2 = vmlaq_f32(_sum2, _k43, _r45);
942 _sum2 = vmlaq_f32(_sum2, _k44, _r46);
943 _sum3 = vmlaq_f32(_sum3, _k40, _r43);
944 _sum3 = vmlaq_f32(_sum3, _k41, _r44);
945 _sum3 = vmlaq_f32(_sum3, _k42, _r45);
946 _sum3 = vmlaq_f32(_sum3, _k43, _r46);
947 _sum3 = vmlaq_f32(_sum3, _k44, _r47);
948
949 vst1q_f32(outptr0, _sum0);
950 vst1q_f32(outptr0 + 4, _sum1);
951 vst1q_f32(outptr0 + 8, _sum2);
952 vst1q_f32(outptr0 + 12, _sum3);
953
954 r0 += 16;
955 r1 += 16;
956 r2 += 16;
957 r3 += 16;
958 r4 += 16;
959 outptr0 += 16;
960 }
961 for (; j + 1 < outw; j += 2)
962 {
963 float32x4_t _sum0 = _bias0;
964 float32x4_t _sum1 = _bias0;
965
966 float32x4_t _r00 = vld1q_f32(r0);
967 float32x4_t _r01 = vld1q_f32(r0 + 4);
968 float32x4_t _r02 = vld1q_f32(r0 + 8);
969 float32x4_t _r03 = vld1q_f32(r0 + 12);
970 float32x4_t _r04 = vld1q_f32(r0 + 16);
971 float32x4_t _r05 = vld1q_f32(r0 + 20);
972
973 float32x4_t _k00 = vld1q_f32(k0);
974 float32x4_t _k01 = vld1q_f32(k0 + 4);
975 float32x4_t _k02 = vld1q_f32(k0 + 8);
976 float32x4_t _k03 = vld1q_f32(k0 + 12);
977 float32x4_t _k04 = vld1q_f32(k0 + 16);
978 k0 += 20;
979
980 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
981 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
982 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
983 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
984 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
985 _sum1 = vmlaq_f32(_sum1, _k00, _r01);
986 _sum1 = vmlaq_f32(_sum1, _k01, _r02);
987 _sum1 = vmlaq_f32(_sum1, _k02, _r03);
988 _sum1 = vmlaq_f32(_sum1, _k03, _r04);
989 _sum1 = vmlaq_f32(_sum1, _k04, _r05);
990
991 float32x4_t _r10 = vld1q_f32(r1);
992 float32x4_t _r11 = vld1q_f32(r1 + 4);
993 float32x4_t _r12 = vld1q_f32(r1 + 8);
994 float32x4_t _r13 = vld1q_f32(r1 + 12);
995 float32x4_t _r14 = vld1q_f32(r1 + 16);
996 float32x4_t _r15 = vld1q_f32(r1 + 20);
997
998 float32x4_t _k10 = vld1q_f32(k0);
999 float32x4_t _k11 = vld1q_f32(k0 + 4);
1000 float32x4_t _k12 = vld1q_f32(k0 + 8);
1001 float32x4_t _k13 = vld1q_f32(k0 + 12);
1002 float32x4_t _k14 = vld1q_f32(k0 + 16);
1003 k0 += 20;
1004
1005 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1006 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1007 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1008 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1009 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1010 _sum1 = vmlaq_f32(_sum1, _k10, _r11);
1011 _sum1 = vmlaq_f32(_sum1, _k11, _r12);
1012 _sum1 = vmlaq_f32(_sum1, _k12, _r13);
1013 _sum1 = vmlaq_f32(_sum1, _k13, _r14);
1014 _sum1 = vmlaq_f32(_sum1, _k14, _r15);
1015
1016 float32x4_t _r20 = vld1q_f32(r2);
1017 float32x4_t _r21 = vld1q_f32(r2 + 4);
1018 float32x4_t _r22 = vld1q_f32(r2 + 8);
1019 float32x4_t _r23 = vld1q_f32(r2 + 12);
1020 float32x4_t _r24 = vld1q_f32(r2 + 16);
1021 float32x4_t _r25 = vld1q_f32(r2 + 20);
1022
1023 float32x4_t _k20 = vld1q_f32(k0);
1024 float32x4_t _k21 = vld1q_f32(k0 + 4);
1025 float32x4_t _k22 = vld1q_f32(k0 + 8);
1026 float32x4_t _k23 = vld1q_f32(k0 + 12);
1027 float32x4_t _k24 = vld1q_f32(k0 + 16);
1028 k0 += 20;
1029
1030 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1031 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1032 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1033 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1034 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1035 _sum1 = vmlaq_f32(_sum1, _k20, _r21);
1036 _sum1 = vmlaq_f32(_sum1, _k21, _r22);
1037 _sum1 = vmlaq_f32(_sum1, _k22, _r23);
1038 _sum1 = vmlaq_f32(_sum1, _k23, _r24);
1039 _sum1 = vmlaq_f32(_sum1, _k24, _r25);
1040
1041 float32x4_t _r30 = vld1q_f32(r3);
1042 float32x4_t _r31 = vld1q_f32(r3 + 4);
1043 float32x4_t _r32 = vld1q_f32(r3 + 8);
1044 float32x4_t _r33 = vld1q_f32(r3 + 12);
1045 float32x4_t _r34 = vld1q_f32(r3 + 16);
1046 float32x4_t _r35 = vld1q_f32(r3 + 20);
1047
1048 float32x4_t _k30 = vld1q_f32(k0);
1049 float32x4_t _k31 = vld1q_f32(k0 + 4);
1050 float32x4_t _k32 = vld1q_f32(k0 + 8);
1051 float32x4_t _k33 = vld1q_f32(k0 + 12);
1052 float32x4_t _k34 = vld1q_f32(k0 + 16);
1053 k0 += 20;
1054
1055 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1056 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1057 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1058 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1059 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1060 _sum1 = vmlaq_f32(_sum1, _k30, _r31);
1061 _sum1 = vmlaq_f32(_sum1, _k31, _r32);
1062 _sum1 = vmlaq_f32(_sum1, _k32, _r33);
1063 _sum1 = vmlaq_f32(_sum1, _k33, _r34);
1064 _sum1 = vmlaq_f32(_sum1, _k34, _r35);
1065
1066 float32x4_t _r40 = vld1q_f32(r4);
1067 float32x4_t _r41 = vld1q_f32(r4 + 4);
1068 float32x4_t _r42 = vld1q_f32(r4 + 8);
1069 float32x4_t _r43 = vld1q_f32(r4 + 12);
1070 float32x4_t _r44 = vld1q_f32(r4 + 16);
1071 float32x4_t _r45 = vld1q_f32(r4 + 20);
1072
1073 float32x4_t _k40 = vld1q_f32(k0);
1074 float32x4_t _k41 = vld1q_f32(k0 + 4);
1075 float32x4_t _k42 = vld1q_f32(k0 + 8);
1076 float32x4_t _k43 = vld1q_f32(k0 + 12);
1077 float32x4_t _k44 = vld1q_f32(k0 + 16);
1078 k0 -= 80;
1079
1080 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1081 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1082 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1083 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1084 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1085 _sum1 = vmlaq_f32(_sum1, _k40, _r41);
1086 _sum1 = vmlaq_f32(_sum1, _k41, _r42);
1087 _sum1 = vmlaq_f32(_sum1, _k42, _r43);
1088 _sum1 = vmlaq_f32(_sum1, _k43, _r44);
1089 _sum1 = vmlaq_f32(_sum1, _k44, _r45);
1090
1091 vst1q_f32(outptr0, _sum0);
1092 vst1q_f32(outptr0 + 4, _sum1);
1093
1094 r0 += 8;
1095 r1 += 8;
1096 r2 += 8;
1097 r3 += 8;
1098 r4 += 8;
1099 outptr0 += 8;
1100 }
1101 for (; j < outw; j++)
1102 {
1103 float32x4_t _sum0 = _bias0;
1104
1105 float32x4_t _r00 = vld1q_f32(r0);
1106 float32x4_t _r01 = vld1q_f32(r0 + 4);
1107 float32x4_t _r02 = vld1q_f32(r0 + 8);
1108 float32x4_t _r03 = vld1q_f32(r0 + 12);
1109 float32x4_t _r04 = vld1q_f32(r0 + 16);
1110
1111 float32x4_t _k00 = vld1q_f32(k0);
1112 float32x4_t _k01 = vld1q_f32(k0 + 4);
1113 float32x4_t _k02 = vld1q_f32(k0 + 8);
1114 float32x4_t _k03 = vld1q_f32(k0 + 12);
1115 float32x4_t _k04 = vld1q_f32(k0 + 16);
1116 k0 += 20;
1117
1118 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1119 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1120 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1121 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1122 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1123
1124 float32x4_t _r10 = vld1q_f32(r1);
1125 float32x4_t _r11 = vld1q_f32(r1 + 4);
1126 float32x4_t _r12 = vld1q_f32(r1 + 8);
1127 float32x4_t _r13 = vld1q_f32(r1 + 12);
1128 float32x4_t _r14 = vld1q_f32(r1 + 16);
1129
1130 float32x4_t _k10 = vld1q_f32(k0);
1131 float32x4_t _k11 = vld1q_f32(k0 + 4);
1132 float32x4_t _k12 = vld1q_f32(k0 + 8);
1133 float32x4_t _k13 = vld1q_f32(k0 + 12);
1134 float32x4_t _k14 = vld1q_f32(k0 + 16);
1135 k0 += 20;
1136
1137 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1138 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1139 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1140 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1141 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1142
1143 float32x4_t _r20 = vld1q_f32(r2);
1144 float32x4_t _r21 = vld1q_f32(r2 + 4);
1145 float32x4_t _r22 = vld1q_f32(r2 + 8);
1146 float32x4_t _r23 = vld1q_f32(r2 + 12);
1147 float32x4_t _r24 = vld1q_f32(r2 + 16);
1148
1149 float32x4_t _k20 = vld1q_f32(k0);
1150 float32x4_t _k21 = vld1q_f32(k0 + 4);
1151 float32x4_t _k22 = vld1q_f32(k0 + 8);
1152 float32x4_t _k23 = vld1q_f32(k0 + 12);
1153 float32x4_t _k24 = vld1q_f32(k0 + 16);
1154 k0 += 20;
1155
1156 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1157 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1158 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1159 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1160 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1161
1162 float32x4_t _r30 = vld1q_f32(r3);
1163 float32x4_t _r31 = vld1q_f32(r3 + 4);
1164 float32x4_t _r32 = vld1q_f32(r3 + 8);
1165 float32x4_t _r33 = vld1q_f32(r3 + 12);
1166 float32x4_t _r34 = vld1q_f32(r3 + 16);
1167
1168 float32x4_t _k30 = vld1q_f32(k0);
1169 float32x4_t _k31 = vld1q_f32(k0 + 4);
1170 float32x4_t _k32 = vld1q_f32(k0 + 8);
1171 float32x4_t _k33 = vld1q_f32(k0 + 12);
1172 float32x4_t _k34 = vld1q_f32(k0 + 16);
1173 k0 += 20;
1174
1175 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1176 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1177 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1178 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1179 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1180
1181 float32x4_t _r40 = vld1q_f32(r4);
1182 float32x4_t _r41 = vld1q_f32(r4 + 4);
1183 float32x4_t _r42 = vld1q_f32(r4 + 8);
1184 float32x4_t _r43 = vld1q_f32(r4 + 12);
1185 float32x4_t _r44 = vld1q_f32(r4 + 16);
1186
1187 float32x4_t _k40 = vld1q_f32(k0);
1188 float32x4_t _k41 = vld1q_f32(k0 + 4);
1189 float32x4_t _k42 = vld1q_f32(k0 + 8);
1190 float32x4_t _k43 = vld1q_f32(k0 + 12);
1191 float32x4_t _k44 = vld1q_f32(k0 + 16);
1192 k0 -= 80;
1193
1194 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1195 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1196 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1197 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1198 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1199
1200 vst1q_f32(outptr0, _sum0);
1201
1202 r0 += 4;
1203 r1 += 4;
1204 r2 += 4;
1205 r3 += 4;
1206 r4 += 4;
1207 outptr0 += 4;
1208 }
1209
1210 r0 += 4 * 4;
1211 r1 += 4 * 4;
1212 r2 += 4 * 4;
1213 r3 += 4 * 4;
1214 r4 += 4 * 4;
1215 }
1216 }
1217 }
1218
// Depthwise convolution, 5x5 kernel, stride 2, for the NEON "pack4" layout:
// each logical pixel is 4 interleaved channel floats, so one float32x4
// load/store moves one packed pixel.  Each group g is convolved independently
// against its own kernel of 25 float32x4 taps (5 rows x 5 cols, 100 floats at
// kernel.row(g)), optionally seeded with a per-group float32x4 bias.
// The inner loops are unrolled to produce 4 / 2 / 1 output pixels per pass.
convdw5x5s2_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)1219 static void convdw5x5s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
1220 {
1221     int w = bottom_blob.w;
1222
1223     int outw = top_blob.w;
1224     int outh = top_blob.h;
1225
1226     const int group = bottom_blob.c;
1227
// After one output row the input pointers have advanced 2*outw packed pixels
// into their row; tailstep (in floats, x4 for pack4) carries them over the
// row remainder plus one whole extra row, i.e. down 2 input rows per output
// row as stride 2 requires.
1228     const int tailstep = (w - 2 * outw + w) * 4;
1229
1230     const float* bias = _bias;
1231
// One thread per group: each group touches only its own input/output channel.
1232     #pragma omp parallel for num_threads(opt.num_threads)
1233     for (int g = 0; g < group; g++)
1234     {
1235         Mat out = top_blob.channel(g);
1236
1237         float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);
1238
1239         const float* k0 = kernel.row(g);
1240
1241         float* outptr0 = out;
1242
1243         const Mat img0 = bottom_blob.channel(g);
1244
// r0..r4 walk the 5 input rows covered by the 5x5 window for output row 0.
1245         const float* r0 = img0.row(0);
1246         const float* r1 = img0.row(1);
1247         const float* r2 = img0.row(2);
1248         const float* r3 = img0.row(3);
1249         const float* r4 = img0.row(4);
1250
1251         int i = 0;
1252
1253         for (; i < outh; i++)
1254         {
1255             int j = 0;
1256
// Main unroll: 4 output pixels per iteration.  With stride 2 the 4 windows
// start at packed offsets 0, 2, 4, 6, so 11 packed loads (_r00.._r010) cover
// all taps for all 4 outputs.
1257             for (; j + 3 < outw; j += 4)
1258             {
1259                 float32x4_t _sum0 = _bias0;
1260                 float32x4_t _sum1 = _bias0;
1261                 float32x4_t _sum2 = _bias0;
1262                 float32x4_t _sum3 = _bias0;
1263
1264                 float32x4_t _r00 = vld1q_f32(r0);
1265                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1266                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1267                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1268                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1269                 float32x4_t _r05 = vld1q_f32(r0 + 20);
1270                 float32x4_t _r06 = vld1q_f32(r0 + 24);
1271                 float32x4_t _r07 = vld1q_f32(r0 + 28);
1272                 float32x4_t _r08 = vld1q_f32(r0 + 32);
1273                 float32x4_t _r09 = vld1q_f32(r0 + 36);
1274                 float32x4_t _r010 = vld1q_f32(r0 + 40);
1275
// Kernel row 0 taps; k0 advances 20 floats (5 taps x 4) per kernel row and
// is rewound by 80 after row 4 so it points at row 0 for the next pixel group.
1276                 float32x4_t _k00 = vld1q_f32(k0);
1277                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1278                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1279                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1280                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1281                 k0 += 20;
1282
// _sum1/_sum2/_sum3 read taps shifted by 2/4/6 packed pixels: the stride-2
// horizontal window offsets.
1283                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1284                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1285                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1286                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1287                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1288                 _sum1 = vmlaq_f32(_sum1, _k00, _r02);
1289                 _sum1 = vmlaq_f32(_sum1, _k01, _r03);
1290                 _sum1 = vmlaq_f32(_sum1, _k02, _r04);
1291                 _sum1 = vmlaq_f32(_sum1, _k03, _r05);
1292                 _sum1 = vmlaq_f32(_sum1, _k04, _r06);
1293                 _sum2 = vmlaq_f32(_sum2, _k00, _r04);
1294                 _sum2 = vmlaq_f32(_sum2, _k01, _r05);
1295                 _sum2 = vmlaq_f32(_sum2, _k02, _r06);
1296                 _sum2 = vmlaq_f32(_sum2, _k03, _r07);
1297                 _sum2 = vmlaq_f32(_sum2, _k04, _r08);
1298                 _sum3 = vmlaq_f32(_sum3, _k00, _r06);
1299                 _sum3 = vmlaq_f32(_sum3, _k01, _r07);
1300                 _sum3 = vmlaq_f32(_sum3, _k02, _r08);
1301                 _sum3 = vmlaq_f32(_sum3, _k03, _r09);
1302                 _sum3 = vmlaq_f32(_sum3, _k04, _r010);
1303
// Input row 1 x kernel row 1 (same pattern as row 0).
1304                 float32x4_t _r10 = vld1q_f32(r1);
1305                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1306                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1307                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1308                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1309                 float32x4_t _r15 = vld1q_f32(r1 + 20);
1310                 float32x4_t _r16 = vld1q_f32(r1 + 24);
1311                 float32x4_t _r17 = vld1q_f32(r1 + 28);
1312                 float32x4_t _r18 = vld1q_f32(r1 + 32);
1313                 float32x4_t _r19 = vld1q_f32(r1 + 36);
1314                 float32x4_t _r110 = vld1q_f32(r1 + 40);
1315
1316                 float32x4_t _k10 = vld1q_f32(k0);
1317                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1318                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1319                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1320                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1321                 k0 += 20;
1322
1323                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1324                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1325                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1326                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1327                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1328                 _sum1 = vmlaq_f32(_sum1, _k10, _r12);
1329                 _sum1 = vmlaq_f32(_sum1, _k11, _r13);
1330                 _sum1 = vmlaq_f32(_sum1, _k12, _r14);
1331                 _sum1 = vmlaq_f32(_sum1, _k13, _r15);
1332                 _sum1 = vmlaq_f32(_sum1, _k14, _r16);
1333                 _sum2 = vmlaq_f32(_sum2, _k10, _r14);
1334                 _sum2 = vmlaq_f32(_sum2, _k11, _r15);
1335                 _sum2 = vmlaq_f32(_sum2, _k12, _r16);
1336                 _sum2 = vmlaq_f32(_sum2, _k13, _r17);
1337                 _sum2 = vmlaq_f32(_sum2, _k14, _r18);
1338                 _sum3 = vmlaq_f32(_sum3, _k10, _r16);
1339                 _sum3 = vmlaq_f32(_sum3, _k11, _r17);
1340                 _sum3 = vmlaq_f32(_sum3, _k12, _r18);
1341                 _sum3 = vmlaq_f32(_sum3, _k13, _r19);
1342                 _sum3 = vmlaq_f32(_sum3, _k14, _r110);
1343
// Input row 2 x kernel row 2.
1344                 float32x4_t _r20 = vld1q_f32(r2);
1345                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1346                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1347                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1348                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1349                 float32x4_t _r25 = vld1q_f32(r2 + 20);
1350                 float32x4_t _r26 = vld1q_f32(r2 + 24);
1351                 float32x4_t _r27 = vld1q_f32(r2 + 28);
1352                 float32x4_t _r28 = vld1q_f32(r2 + 32);
1353                 float32x4_t _r29 = vld1q_f32(r2 + 36);
1354                 float32x4_t _r210 = vld1q_f32(r2 + 40);
1355
1356                 float32x4_t _k20 = vld1q_f32(k0);
1357                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1358                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1359                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1360                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1361                 k0 += 20;
1362
1363                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1364                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1365                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1366                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1367                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1368                 _sum1 = vmlaq_f32(_sum1, _k20, _r22);
1369                 _sum1 = vmlaq_f32(_sum1, _k21, _r23);
1370                 _sum1 = vmlaq_f32(_sum1, _k22, _r24);
1371                 _sum1 = vmlaq_f32(_sum1, _k23, _r25);
1372                 _sum1 = vmlaq_f32(_sum1, _k24, _r26);
1373                 _sum2 = vmlaq_f32(_sum2, _k20, _r24);
1374                 _sum2 = vmlaq_f32(_sum2, _k21, _r25);
1375                 _sum2 = vmlaq_f32(_sum2, _k22, _r26);
1376                 _sum2 = vmlaq_f32(_sum2, _k23, _r27);
1377                 _sum2 = vmlaq_f32(_sum2, _k24, _r28);
1378                 _sum3 = vmlaq_f32(_sum3, _k20, _r26);
1379                 _sum3 = vmlaq_f32(_sum3, _k21, _r27);
1380                 _sum3 = vmlaq_f32(_sum3, _k22, _r28);
1381                 _sum3 = vmlaq_f32(_sum3, _k23, _r29);
1382                 _sum3 = vmlaq_f32(_sum3, _k24, _r210);
1383
// Input row 3 x kernel row 3.
1384                 float32x4_t _r30 = vld1q_f32(r3);
1385                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1386                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1387                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1388                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1389                 float32x4_t _r35 = vld1q_f32(r3 + 20);
1390                 float32x4_t _r36 = vld1q_f32(r3 + 24);
1391                 float32x4_t _r37 = vld1q_f32(r3 + 28);
1392                 float32x4_t _r38 = vld1q_f32(r3 + 32);
1393                 float32x4_t _r39 = vld1q_f32(r3 + 36);
1394                 float32x4_t _r310 = vld1q_f32(r3 + 40);
1395
1396                 float32x4_t _k30 = vld1q_f32(k0);
1397                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1398                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1399                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1400                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1401                 k0 += 20;
1402
1403                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1404                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1405                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1406                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1407                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1408                 _sum1 = vmlaq_f32(_sum1, _k30, _r32);
1409                 _sum1 = vmlaq_f32(_sum1, _k31, _r33);
1410                 _sum1 = vmlaq_f32(_sum1, _k32, _r34);
1411                 _sum1 = vmlaq_f32(_sum1, _k33, _r35);
1412                 _sum1 = vmlaq_f32(_sum1, _k34, _r36);
1413                 _sum2 = vmlaq_f32(_sum2, _k30, _r34);
1414                 _sum2 = vmlaq_f32(_sum2, _k31, _r35);
1415                 _sum2 = vmlaq_f32(_sum2, _k32, _r36);
1416                 _sum2 = vmlaq_f32(_sum2, _k33, _r37);
1417                 _sum2 = vmlaq_f32(_sum2, _k34, _r38);
1418                 _sum3 = vmlaq_f32(_sum3, _k30, _r36);
1419                 _sum3 = vmlaq_f32(_sum3, _k31, _r37);
1420                 _sum3 = vmlaq_f32(_sum3, _k32, _r38);
1421                 _sum3 = vmlaq_f32(_sum3, _k33, _r39);
1422                 _sum3 = vmlaq_f32(_sum3, _k34, _r310);
1423
// Input row 4 x kernel row 4 (last row: k0 is rewound after this).
1424                 float32x4_t _r40 = vld1q_f32(r4);
1425                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1426                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1427                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1428                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1429                 float32x4_t _r45 = vld1q_f32(r4 + 20);
1430                 float32x4_t _r46 = vld1q_f32(r4 + 24);
1431                 float32x4_t _r47 = vld1q_f32(r4 + 28);
1432                 float32x4_t _r48 = vld1q_f32(r4 + 32);
1433                 float32x4_t _r49 = vld1q_f32(r4 + 36);
1434                 float32x4_t _r410 = vld1q_f32(r4 + 40);
1435
1436                 float32x4_t _k40 = vld1q_f32(k0);
1437                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1438                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1439                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1440                 float32x4_t _k44 = vld1q_f32(k0 + 16);
// Rewind over the 4 earlier += 20 advances: k0 points back at kernel row 0.
1441                 k0 -= 80;
1442
1443                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1444                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1445                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1446                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1447                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1448                 _sum1 = vmlaq_f32(_sum1, _k40, _r42);
1449                 _sum1 = vmlaq_f32(_sum1, _k41, _r43);
1450                 _sum1 = vmlaq_f32(_sum1, _k42, _r44);
1451                 _sum1 = vmlaq_f32(_sum1, _k43, _r45);
1452                 _sum1 = vmlaq_f32(_sum1, _k44, _r46);
1453                 _sum2 = vmlaq_f32(_sum2, _k40, _r44);
1454                 _sum2 = vmlaq_f32(_sum2, _k41, _r45);
1455                 _sum2 = vmlaq_f32(_sum2, _k42, _r46);
1456                 _sum2 = vmlaq_f32(_sum2, _k43, _r47);
1457                 _sum2 = vmlaq_f32(_sum2, _k44, _r48);
1458                 _sum3 = vmlaq_f32(_sum3, _k40, _r46);
1459                 _sum3 = vmlaq_f32(_sum3, _k41, _r47);
1460                 _sum3 = vmlaq_f32(_sum3, _k42, _r48);
1461                 _sum3 = vmlaq_f32(_sum3, _k43, _r49);
1462                 _sum3 = vmlaq_f32(_sum3, _k44, _r410);
1463
1464                 vst1q_f32(outptr0, _sum0);
1465                 vst1q_f32(outptr0 + 4, _sum1);
1466                 vst1q_f32(outptr0 + 8, _sum2);
1467                 vst1q_f32(outptr0 + 12, _sum3);
1468
// 4 outputs x stride 2 = 8 packed input pixels consumed (x4 floats).
1469                 r0 += 8 * 4;
1470                 r1 += 8 * 4;
1471                 r2 += 8 * 4;
1472                 r3 += 8 * 4;
1473                 r4 += 8 * 4;
1474                 outptr0 += 16;
1475             }
// Tail unroll: 2 output pixels per iteration (windows at offsets 0 and 2).
1476             for (; j + 1 < outw; j += 2)
1477             {
1478                 float32x4_t _sum0 = _bias0;
1479                 float32x4_t _sum1 = _bias0;
1480
1481                 float32x4_t _r00 = vld1q_f32(r0);
1482                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1483                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1484                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1485                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1486                 float32x4_t _r05 = vld1q_f32(r0 + 20);
1487                 float32x4_t _r06 = vld1q_f32(r0 + 24);
1488
1489                 float32x4_t _k00 = vld1q_f32(k0);
1490                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1491                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1492                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1493                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1494                 k0 += 20;
1495
1496                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1497                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1498                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1499                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1500                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1501                 _sum1 = vmlaq_f32(_sum1, _k00, _r02);
1502                 _sum1 = vmlaq_f32(_sum1, _k01, _r03);
1503                 _sum1 = vmlaq_f32(_sum1, _k02, _r04);
1504                 _sum1 = vmlaq_f32(_sum1, _k03, _r05);
1505                 _sum1 = vmlaq_f32(_sum1, _k04, _r06);
1506
1507                 float32x4_t _r10 = vld1q_f32(r1);
1508                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1509                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1510                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1511                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1512                 float32x4_t _r15 = vld1q_f32(r1 + 20);
1513                 float32x4_t _r16 = vld1q_f32(r1 + 24);
1514
1515                 float32x4_t _k10 = vld1q_f32(k0);
1516                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1517                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1518                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1519                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1520                 k0 += 20;
1521
1522                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1523                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1524                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1525                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1526                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1527                 _sum1 = vmlaq_f32(_sum1, _k10, _r12);
1528                 _sum1 = vmlaq_f32(_sum1, _k11, _r13);
1529                 _sum1 = vmlaq_f32(_sum1, _k12, _r14);
1530                 _sum1 = vmlaq_f32(_sum1, _k13, _r15);
1531                 _sum1 = vmlaq_f32(_sum1, _k14, _r16);
1532
1533                 float32x4_t _r20 = vld1q_f32(r2);
1534                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1535                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1536                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1537                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1538                 float32x4_t _r25 = vld1q_f32(r2 + 20);
1539                 float32x4_t _r26 = vld1q_f32(r2 + 24);
1540
1541                 float32x4_t _k20 = vld1q_f32(k0);
1542                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1543                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1544                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1545                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1546                 k0 += 20;
1547
1548                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1549                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1550                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1551                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1552                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1553                 _sum1 = vmlaq_f32(_sum1, _k20, _r22);
1554                 _sum1 = vmlaq_f32(_sum1, _k21, _r23);
1555                 _sum1 = vmlaq_f32(_sum1, _k22, _r24);
1556                 _sum1 = vmlaq_f32(_sum1, _k23, _r25);
1557                 _sum1 = vmlaq_f32(_sum1, _k24, _r26);
1558
1559                 float32x4_t _r30 = vld1q_f32(r3);
1560                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1561                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1562                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1563                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1564                 float32x4_t _r35 = vld1q_f32(r3 + 20);
1565                 float32x4_t _r36 = vld1q_f32(r3 + 24);
1566
1567                 float32x4_t _k30 = vld1q_f32(k0);
1568                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1569                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1570                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1571                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1572                 k0 += 20;
1573
1574                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1575                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1576                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1577                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1578                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1579                 _sum1 = vmlaq_f32(_sum1, _k30, _r32);
1580                 _sum1 = vmlaq_f32(_sum1, _k31, _r33);
1581                 _sum1 = vmlaq_f32(_sum1, _k32, _r34);
1582                 _sum1 = vmlaq_f32(_sum1, _k33, _r35);
1583                 _sum1 = vmlaq_f32(_sum1, _k34, _r36);
1584
1585                 float32x4_t _r40 = vld1q_f32(r4);
1586                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1587                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1588                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1589                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1590                 float32x4_t _r45 = vld1q_f32(r4 + 20);
1591                 float32x4_t _r46 = vld1q_f32(r4 + 24);
1592
1593                 float32x4_t _k40 = vld1q_f32(k0);
1594                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1595                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1596                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1597                 float32x4_t _k44 = vld1q_f32(k0 + 16);
// Back to kernel row 0 for the next output position.
1598                 k0 -= 80;
1599
1600                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1601                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1602                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1603                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1604                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1605                 _sum1 = vmlaq_f32(_sum1, _k40, _r42);
1606                 _sum1 = vmlaq_f32(_sum1, _k41, _r43);
1607                 _sum1 = vmlaq_f32(_sum1, _k42, _r44);
1608                 _sum1 = vmlaq_f32(_sum1, _k43, _r45);
1609                 _sum1 = vmlaq_f32(_sum1, _k44, _r46);
1610
1611                 vst1q_f32(outptr0, _sum0);
1612                 vst1q_f32(outptr0 + 4, _sum1);
1613
// 2 outputs x stride 2 = 4 packed input pixels consumed.
1614                 r0 += 4 * 4;
1615                 r1 += 4 * 4;
1616                 r2 += 4 * 4;
1617                 r3 += 4 * 4;
1618                 r4 += 4 * 4;
1619                 outptr0 += 8;
1620             }
// Scalar tail: one output pixel per iteration (5 taps per row, no sharing).
1621             for (; j < outw; j++)
1622             {
1623                 float32x4_t _sum0 = _bias0;
1624
1625                 float32x4_t _r00 = vld1q_f32(r0);
1626                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1627                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1628                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1629                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1630
1631                 float32x4_t _k00 = vld1q_f32(k0);
1632                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1633                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1634                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1635                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1636                 k0 += 20;
1637
1638                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1639                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1640                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1641                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1642                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1643
1644                 float32x4_t _r10 = vld1q_f32(r1);
1645                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1646                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1647                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1648                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1649
1650                 float32x4_t _k10 = vld1q_f32(k0);
1651                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1652                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1653                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1654                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1655                 k0 += 20;
1656
1657                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1658                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1659                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1660                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1661                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1662
1663                 float32x4_t _r20 = vld1q_f32(r2);
1664                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1665                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1666                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1667                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1668
1669                 float32x4_t _k20 = vld1q_f32(k0);
1670                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1671                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1672                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1673                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1674                 k0 += 20;
1675
1676                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1677                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1678                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1679                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1680                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1681
1682                 float32x4_t _r30 = vld1q_f32(r3);
1683                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1684                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1685                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1686                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1687
1688                 float32x4_t _k30 = vld1q_f32(k0);
1689                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1690                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1691                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1692                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1693                 k0 += 20;
1694
1695                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1696                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1697                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1698                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1699                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1700
1701                 float32x4_t _r40 = vld1q_f32(r4);
1702                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1703                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1704                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1705                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1706
1707                 float32x4_t _k40 = vld1q_f32(k0);
1708                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1709                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1710                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1711                 float32x4_t _k44 = vld1q_f32(k0 + 16);
// Back to kernel row 0 for the next output position.
1712                 k0 -= 80;
1713
1714                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1715                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1716                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1717                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1718                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1719
1720                 vst1q_f32(outptr0, _sum0);
1721
// 1 output x stride 2 = 2 packed input pixels consumed.
1722                 r0 += 2 * 4;
1723                 r1 += 2 * 4;
1724                 r2 += 2 * 4;
1725                 r3 += 2 * 4;
1726                 r4 += 2 * 4;
1727                 outptr0 += 4;
1728             }
1729
// Advance all five row pointers down two input rows (stride 2 vertically).
1730             r0 += tailstep;
1731             r1 += tailstep;
1732             r2 += tailstep;
1733             r3 += tailstep;
1734             r4 += tailstep;
1735         }
1736     }
1737 }
1738