1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
convdw5x5s1_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void convdw5x5s1_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 #if __aarch64__
18     const int w = bottom_blob.w;
19 #endif
20 
21     const int outw = top_blob.w;
22     const int outh = top_blob.h;
23 
24     const int group = bottom_blob.c;
25 
26     const float* bias = _bias;
27 
28     #pragma omp parallel for num_threads(opt.num_threads)
29     for (int g = 0; g < group; g++)
30     {
31         Mat out = top_blob.channel(g);
32 
33         float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);
34 
35         const float* k0 = kernel.row(g);
36 
37         float* outptr0 = out.row(0);
38 
39         const Mat img0 = bottom_blob.channel(g);
40 
41         const float* r0 = img0.row(0);
42         const float* r1 = img0.row(1);
43         const float* r2 = img0.row(2);
44         const float* r3 = img0.row(3);
45         const float* r4 = img0.row(4);
46 
47         int i = 0;
48 
49 #if __aarch64__
50         float* outptr1 = out.row(1);
51         const float* r5 = img0.row(5);
52 
53         for (; i + 1 < outh; i += 2)
54         {
55             int j = 0;
56 
57             for (; j + 3 < outw; j += 4)
58             {
59                 float32x4_t _sum00 = _bias0;
60                 float32x4_t _sum01 = _bias0;
61                 float32x4_t _sum02 = _bias0;
62                 float32x4_t _sum03 = _bias0;
63                 float32x4_t _sum10 = _bias0;
64                 float32x4_t _sum11 = _bias0;
65                 float32x4_t _sum12 = _bias0;
66                 float32x4_t _sum13 = _bias0;
67 
68                 float32x4_t _r00 = vld1q_f32(r0);
69                 float32x4_t _r01 = vld1q_f32(r0 + 4);
70                 float32x4_t _r02 = vld1q_f32(r0 + 8);
71                 float32x4_t _r03 = vld1q_f32(r0 + 12);
72                 float32x4_t _r04 = vld1q_f32(r0 + 16);
73                 float32x4_t _r05 = vld1q_f32(r0 + 20);
74                 float32x4_t _r06 = vld1q_f32(r0 + 24);
75                 float32x4_t _r07 = vld1q_f32(r0 + 28);
76 
77                 float32x4_t _k00 = vld1q_f32(k0);
78                 float32x4_t _k01 = vld1q_f32(k0 + 4);
79                 float32x4_t _k02 = vld1q_f32(k0 + 8);
80                 float32x4_t _k03 = vld1q_f32(k0 + 12);
81                 float32x4_t _k04 = vld1q_f32(k0 + 16);
82                 k0 += 20;
83 
84                 _sum00 = vmlaq_f32(_sum00, _k00, _r00);
85                 _sum00 = vmlaq_f32(_sum00, _k01, _r01);
86                 _sum00 = vmlaq_f32(_sum00, _k02, _r02);
87                 _sum00 = vmlaq_f32(_sum00, _k03, _r03);
88                 _sum00 = vmlaq_f32(_sum00, _k04, _r04);
89                 _sum01 = vmlaq_f32(_sum01, _k00, _r01);
90                 _sum01 = vmlaq_f32(_sum01, _k01, _r02);
91                 _sum01 = vmlaq_f32(_sum01, _k02, _r03);
92                 _sum01 = vmlaq_f32(_sum01, _k03, _r04);
93                 _sum01 = vmlaq_f32(_sum01, _k04, _r05);
94                 _sum02 = vmlaq_f32(_sum02, _k00, _r02);
95                 _sum02 = vmlaq_f32(_sum02, _k01, _r03);
96                 _sum02 = vmlaq_f32(_sum02, _k02, _r04);
97                 _sum02 = vmlaq_f32(_sum02, _k03, _r05);
98                 _sum02 = vmlaq_f32(_sum02, _k04, _r06);
99                 _sum03 = vmlaq_f32(_sum03, _k00, _r03);
100                 _sum03 = vmlaq_f32(_sum03, _k01, _r04);
101                 _sum03 = vmlaq_f32(_sum03, _k02, _r05);
102                 _sum03 = vmlaq_f32(_sum03, _k03, _r06);
103                 _sum03 = vmlaq_f32(_sum03, _k04, _r07);
104 
105                 float32x4_t _r10 = vld1q_f32(r1);
106                 float32x4_t _r11 = vld1q_f32(r1 + 4);
107                 float32x4_t _r12 = vld1q_f32(r1 + 8);
108                 float32x4_t _r13 = vld1q_f32(r1 + 12);
109                 float32x4_t _r14 = vld1q_f32(r1 + 16);
110                 float32x4_t _r15 = vld1q_f32(r1 + 20);
111                 float32x4_t _r16 = vld1q_f32(r1 + 24);
112                 float32x4_t _r17 = vld1q_f32(r1 + 28);
113 
114                 float32x4_t _k10 = vld1q_f32(k0);
115                 float32x4_t _k11 = vld1q_f32(k0 + 4);
116                 float32x4_t _k12 = vld1q_f32(k0 + 8);
117                 float32x4_t _k13 = vld1q_f32(k0 + 12);
118                 float32x4_t _k14 = vld1q_f32(k0 + 16);
119                 k0 += 20;
120 
121                 _sum10 = vmlaq_f32(_sum10, _k00, _r10);
122                 _sum10 = vmlaq_f32(_sum10, _k01, _r11);
123                 _sum10 = vmlaq_f32(_sum10, _k02, _r12);
124                 _sum10 = vmlaq_f32(_sum10, _k03, _r13);
125                 _sum10 = vmlaq_f32(_sum10, _k04, _r14);
126                 _sum11 = vmlaq_f32(_sum11, _k00, _r11);
127                 _sum11 = vmlaq_f32(_sum11, _k01, _r12);
128                 _sum11 = vmlaq_f32(_sum11, _k02, _r13);
129                 _sum11 = vmlaq_f32(_sum11, _k03, _r14);
130                 _sum11 = vmlaq_f32(_sum11, _k04, _r15);
131                 _sum12 = vmlaq_f32(_sum12, _k00, _r12);
132                 _sum12 = vmlaq_f32(_sum12, _k01, _r13);
133                 _sum12 = vmlaq_f32(_sum12, _k02, _r14);
134                 _sum12 = vmlaq_f32(_sum12, _k03, _r15);
135                 _sum12 = vmlaq_f32(_sum12, _k04, _r16);
136                 _sum13 = vmlaq_f32(_sum13, _k00, _r13);
137                 _sum13 = vmlaq_f32(_sum13, _k01, _r14);
138                 _sum13 = vmlaq_f32(_sum13, _k02, _r15);
139                 _sum13 = vmlaq_f32(_sum13, _k03, _r16);
140                 _sum13 = vmlaq_f32(_sum13, _k04, _r17);
141 
142                 _sum00 = vmlaq_f32(_sum00, _k10, _r10);
143                 _sum00 = vmlaq_f32(_sum00, _k11, _r11);
144                 _sum00 = vmlaq_f32(_sum00, _k12, _r12);
145                 _sum00 = vmlaq_f32(_sum00, _k13, _r13);
146                 _sum00 = vmlaq_f32(_sum00, _k14, _r14);
147                 _sum01 = vmlaq_f32(_sum01, _k10, _r11);
148                 _sum01 = vmlaq_f32(_sum01, _k11, _r12);
149                 _sum01 = vmlaq_f32(_sum01, _k12, _r13);
150                 _sum01 = vmlaq_f32(_sum01, _k13, _r14);
151                 _sum01 = vmlaq_f32(_sum01, _k14, _r15);
152                 _sum02 = vmlaq_f32(_sum02, _k10, _r12);
153                 _sum02 = vmlaq_f32(_sum02, _k11, _r13);
154                 _sum02 = vmlaq_f32(_sum02, _k12, _r14);
155                 _sum02 = vmlaq_f32(_sum02, _k13, _r15);
156                 _sum02 = vmlaq_f32(_sum02, _k14, _r16);
157                 _sum03 = vmlaq_f32(_sum03, _k10, _r13);
158                 _sum03 = vmlaq_f32(_sum03, _k11, _r14);
159                 _sum03 = vmlaq_f32(_sum03, _k12, _r15);
160                 _sum03 = vmlaq_f32(_sum03, _k13, _r16);
161                 _sum03 = vmlaq_f32(_sum03, _k14, _r17);
162 
163                 float32x4_t _r20 = vld1q_f32(r2);
164                 float32x4_t _r21 = vld1q_f32(r2 + 4);
165                 float32x4_t _r22 = vld1q_f32(r2 + 8);
166                 float32x4_t _r23 = vld1q_f32(r2 + 12);
167                 float32x4_t _r24 = vld1q_f32(r2 + 16);
168                 float32x4_t _r25 = vld1q_f32(r2 + 20);
169                 float32x4_t _r26 = vld1q_f32(r2 + 24);
170                 float32x4_t _r27 = vld1q_f32(r2 + 28);
171 
172                 float32x4_t _k20 = vld1q_f32(k0);
173                 float32x4_t _k21 = vld1q_f32(k0 + 4);
174                 float32x4_t _k22 = vld1q_f32(k0 + 8);
175                 float32x4_t _k23 = vld1q_f32(k0 + 12);
176                 float32x4_t _k24 = vld1q_f32(k0 + 16);
177                 k0 += 20;
178 
179                 _sum10 = vmlaq_f32(_sum10, _k10, _r20);
180                 _sum10 = vmlaq_f32(_sum10, _k11, _r21);
181                 _sum10 = vmlaq_f32(_sum10, _k12, _r22);
182                 _sum10 = vmlaq_f32(_sum10, _k13, _r23);
183                 _sum10 = vmlaq_f32(_sum10, _k14, _r24);
184                 _sum11 = vmlaq_f32(_sum11, _k10, _r21);
185                 _sum11 = vmlaq_f32(_sum11, _k11, _r22);
186                 _sum11 = vmlaq_f32(_sum11, _k12, _r23);
187                 _sum11 = vmlaq_f32(_sum11, _k13, _r24);
188                 _sum11 = vmlaq_f32(_sum11, _k14, _r25);
189                 _sum12 = vmlaq_f32(_sum12, _k10, _r22);
190                 _sum12 = vmlaq_f32(_sum12, _k11, _r23);
191                 _sum12 = vmlaq_f32(_sum12, _k12, _r24);
192                 _sum12 = vmlaq_f32(_sum12, _k13, _r25);
193                 _sum12 = vmlaq_f32(_sum12, _k14, _r26);
194                 _sum13 = vmlaq_f32(_sum13, _k10, _r23);
195                 _sum13 = vmlaq_f32(_sum13, _k11, _r24);
196                 _sum13 = vmlaq_f32(_sum13, _k12, _r25);
197                 _sum13 = vmlaq_f32(_sum13, _k13, _r26);
198                 _sum13 = vmlaq_f32(_sum13, _k14, _r27);
199 
200                 _sum00 = vmlaq_f32(_sum00, _k20, _r20);
201                 _sum00 = vmlaq_f32(_sum00, _k21, _r21);
202                 _sum00 = vmlaq_f32(_sum00, _k22, _r22);
203                 _sum00 = vmlaq_f32(_sum00, _k23, _r23);
204                 _sum00 = vmlaq_f32(_sum00, _k24, _r24);
205                 _sum01 = vmlaq_f32(_sum01, _k20, _r21);
206                 _sum01 = vmlaq_f32(_sum01, _k21, _r22);
207                 _sum01 = vmlaq_f32(_sum01, _k22, _r23);
208                 _sum01 = vmlaq_f32(_sum01, _k23, _r24);
209                 _sum01 = vmlaq_f32(_sum01, _k24, _r25);
210                 _sum02 = vmlaq_f32(_sum02, _k20, _r22);
211                 _sum02 = vmlaq_f32(_sum02, _k21, _r23);
212                 _sum02 = vmlaq_f32(_sum02, _k22, _r24);
213                 _sum02 = vmlaq_f32(_sum02, _k23, _r25);
214                 _sum02 = vmlaq_f32(_sum02, _k24, _r26);
215                 _sum03 = vmlaq_f32(_sum03, _k20, _r23);
216                 _sum03 = vmlaq_f32(_sum03, _k21, _r24);
217                 _sum03 = vmlaq_f32(_sum03, _k22, _r25);
218                 _sum03 = vmlaq_f32(_sum03, _k23, _r26);
219                 _sum03 = vmlaq_f32(_sum03, _k24, _r27);
220 
221                 float32x4_t _r30 = vld1q_f32(r3);
222                 float32x4_t _r31 = vld1q_f32(r3 + 4);
223                 float32x4_t _r32 = vld1q_f32(r3 + 8);
224                 float32x4_t _r33 = vld1q_f32(r3 + 12);
225                 float32x4_t _r34 = vld1q_f32(r3 + 16);
226                 float32x4_t _r35 = vld1q_f32(r3 + 20);
227                 float32x4_t _r36 = vld1q_f32(r3 + 24);
228                 float32x4_t _r37 = vld1q_f32(r3 + 28);
229 
230                 float32x4_t _k30 = vld1q_f32(k0);
231                 float32x4_t _k31 = vld1q_f32(k0 + 4);
232                 float32x4_t _k32 = vld1q_f32(k0 + 8);
233                 float32x4_t _k33 = vld1q_f32(k0 + 12);
234                 float32x4_t _k34 = vld1q_f32(k0 + 16);
235                 k0 += 20;
236 
237                 _sum10 = vmlaq_f32(_sum10, _k20, _r30);
238                 _sum10 = vmlaq_f32(_sum10, _k21, _r31);
239                 _sum10 = vmlaq_f32(_sum10, _k22, _r32);
240                 _sum10 = vmlaq_f32(_sum10, _k23, _r33);
241                 _sum10 = vmlaq_f32(_sum10, _k24, _r34);
242                 _sum11 = vmlaq_f32(_sum11, _k20, _r31);
243                 _sum11 = vmlaq_f32(_sum11, _k21, _r32);
244                 _sum11 = vmlaq_f32(_sum11, _k22, _r33);
245                 _sum11 = vmlaq_f32(_sum11, _k23, _r34);
246                 _sum11 = vmlaq_f32(_sum11, _k24, _r35);
247                 _sum12 = vmlaq_f32(_sum12, _k20, _r32);
248                 _sum12 = vmlaq_f32(_sum12, _k21, _r33);
249                 _sum12 = vmlaq_f32(_sum12, _k22, _r34);
250                 _sum12 = vmlaq_f32(_sum12, _k23, _r35);
251                 _sum12 = vmlaq_f32(_sum12, _k24, _r36);
252                 _sum13 = vmlaq_f32(_sum13, _k20, _r33);
253                 _sum13 = vmlaq_f32(_sum13, _k21, _r34);
254                 _sum13 = vmlaq_f32(_sum13, _k22, _r35);
255                 _sum13 = vmlaq_f32(_sum13, _k23, _r36);
256                 _sum13 = vmlaq_f32(_sum13, _k24, _r37);
257 
258                 _sum00 = vmlaq_f32(_sum00, _k30, _r30);
259                 _sum00 = vmlaq_f32(_sum00, _k31, _r31);
260                 _sum00 = vmlaq_f32(_sum00, _k32, _r32);
261                 _sum00 = vmlaq_f32(_sum00, _k33, _r33);
262                 _sum00 = vmlaq_f32(_sum00, _k34, _r34);
263                 _sum01 = vmlaq_f32(_sum01, _k30, _r31);
264                 _sum01 = vmlaq_f32(_sum01, _k31, _r32);
265                 _sum01 = vmlaq_f32(_sum01, _k32, _r33);
266                 _sum01 = vmlaq_f32(_sum01, _k33, _r34);
267                 _sum01 = vmlaq_f32(_sum01, _k34, _r35);
268                 _sum02 = vmlaq_f32(_sum02, _k30, _r32);
269                 _sum02 = vmlaq_f32(_sum02, _k31, _r33);
270                 _sum02 = vmlaq_f32(_sum02, _k32, _r34);
271                 _sum02 = vmlaq_f32(_sum02, _k33, _r35);
272                 _sum02 = vmlaq_f32(_sum02, _k34, _r36);
273                 _sum03 = vmlaq_f32(_sum03, _k30, _r33);
274                 _sum03 = vmlaq_f32(_sum03, _k31, _r34);
275                 _sum03 = vmlaq_f32(_sum03, _k32, _r35);
276                 _sum03 = vmlaq_f32(_sum03, _k33, _r36);
277                 _sum03 = vmlaq_f32(_sum03, _k34, _r37);
278 
279                 float32x4_t _r40 = vld1q_f32(r4);
280                 float32x4_t _r41 = vld1q_f32(r4 + 4);
281                 float32x4_t _r42 = vld1q_f32(r4 + 8);
282                 float32x4_t _r43 = vld1q_f32(r4 + 12);
283                 float32x4_t _r44 = vld1q_f32(r4 + 16);
284                 float32x4_t _r45 = vld1q_f32(r4 + 20);
285                 float32x4_t _r46 = vld1q_f32(r4 + 24);
286                 float32x4_t _r47 = vld1q_f32(r4 + 28);
287 
288                 float32x4_t _k40 = vld1q_f32(k0);
289                 float32x4_t _k41 = vld1q_f32(k0 + 4);
290                 float32x4_t _k42 = vld1q_f32(k0 + 8);
291                 float32x4_t _k43 = vld1q_f32(k0 + 12);
292                 float32x4_t _k44 = vld1q_f32(k0 + 16);
293                 k0 -= 80;
294 
295                 _sum10 = vmlaq_f32(_sum10, _k30, _r40);
296                 _sum10 = vmlaq_f32(_sum10, _k31, _r41);
297                 _sum10 = vmlaq_f32(_sum10, _k32, _r42);
298                 _sum10 = vmlaq_f32(_sum10, _k33, _r43);
299                 _sum10 = vmlaq_f32(_sum10, _k34, _r44);
300                 _sum11 = vmlaq_f32(_sum11, _k30, _r41);
301                 _sum11 = vmlaq_f32(_sum11, _k31, _r42);
302                 _sum11 = vmlaq_f32(_sum11, _k32, _r43);
303                 _sum11 = vmlaq_f32(_sum11, _k33, _r44);
304                 _sum11 = vmlaq_f32(_sum11, _k34, _r45);
305                 _sum12 = vmlaq_f32(_sum12, _k30, _r42);
306                 _sum12 = vmlaq_f32(_sum12, _k31, _r43);
307                 _sum12 = vmlaq_f32(_sum12, _k32, _r44);
308                 _sum12 = vmlaq_f32(_sum12, _k33, _r45);
309                 _sum12 = vmlaq_f32(_sum12, _k34, _r46);
310                 _sum13 = vmlaq_f32(_sum13, _k30, _r43);
311                 _sum13 = vmlaq_f32(_sum13, _k31, _r44);
312                 _sum13 = vmlaq_f32(_sum13, _k32, _r45);
313                 _sum13 = vmlaq_f32(_sum13, _k33, _r46);
314                 _sum13 = vmlaq_f32(_sum13, _k34, _r47);
315 
316                 _sum00 = vmlaq_f32(_sum00, _k40, _r40);
317                 _sum00 = vmlaq_f32(_sum00, _k41, _r41);
318                 _sum00 = vmlaq_f32(_sum00, _k42, _r42);
319                 _sum00 = vmlaq_f32(_sum00, _k43, _r43);
320                 _sum00 = vmlaq_f32(_sum00, _k44, _r44);
321                 _sum01 = vmlaq_f32(_sum01, _k40, _r41);
322                 _sum01 = vmlaq_f32(_sum01, _k41, _r42);
323                 _sum01 = vmlaq_f32(_sum01, _k42, _r43);
324                 _sum01 = vmlaq_f32(_sum01, _k43, _r44);
325                 _sum01 = vmlaq_f32(_sum01, _k44, _r45);
326                 _sum02 = vmlaq_f32(_sum02, _k40, _r42);
327                 _sum02 = vmlaq_f32(_sum02, _k41, _r43);
328                 _sum02 = vmlaq_f32(_sum02, _k42, _r44);
329                 _sum02 = vmlaq_f32(_sum02, _k43, _r45);
330                 _sum02 = vmlaq_f32(_sum02, _k44, _r46);
331                 _sum03 = vmlaq_f32(_sum03, _k40, _r43);
332                 _sum03 = vmlaq_f32(_sum03, _k41, _r44);
333                 _sum03 = vmlaq_f32(_sum03, _k42, _r45);
334                 _sum03 = vmlaq_f32(_sum03, _k43, _r46);
335                 _sum03 = vmlaq_f32(_sum03, _k44, _r47);
336 
337                 float32x4_t _r50 = vld1q_f32(r5);
338                 float32x4_t _r51 = vld1q_f32(r5 + 4);
339                 float32x4_t _r52 = vld1q_f32(r5 + 8);
340                 float32x4_t _r53 = vld1q_f32(r5 + 12);
341                 float32x4_t _r54 = vld1q_f32(r5 + 16);
342                 float32x4_t _r55 = vld1q_f32(r5 + 20);
343                 float32x4_t _r56 = vld1q_f32(r5 + 24);
344                 float32x4_t _r57 = vld1q_f32(r5 + 28);
345 
346                 _sum10 = vmlaq_f32(_sum10, _k40, _r50);
347                 _sum10 = vmlaq_f32(_sum10, _k41, _r51);
348                 _sum10 = vmlaq_f32(_sum10, _k42, _r52);
349                 _sum10 = vmlaq_f32(_sum10, _k43, _r53);
350                 _sum10 = vmlaq_f32(_sum10, _k44, _r54);
351                 _sum11 = vmlaq_f32(_sum11, _k40, _r51);
352                 _sum11 = vmlaq_f32(_sum11, _k41, _r52);
353                 _sum11 = vmlaq_f32(_sum11, _k42, _r53);
354                 _sum11 = vmlaq_f32(_sum11, _k43, _r54);
355                 _sum11 = vmlaq_f32(_sum11, _k44, _r55);
356                 _sum12 = vmlaq_f32(_sum12, _k40, _r52);
357                 _sum12 = vmlaq_f32(_sum12, _k41, _r53);
358                 _sum12 = vmlaq_f32(_sum12, _k42, _r54);
359                 _sum12 = vmlaq_f32(_sum12, _k43, _r55);
360                 _sum12 = vmlaq_f32(_sum12, _k44, _r56);
361                 _sum13 = vmlaq_f32(_sum13, _k40, _r53);
362                 _sum13 = vmlaq_f32(_sum13, _k41, _r54);
363                 _sum13 = vmlaq_f32(_sum13, _k42, _r55);
364                 _sum13 = vmlaq_f32(_sum13, _k43, _r56);
365                 _sum13 = vmlaq_f32(_sum13, _k44, _r57);
366 
367                 vst1q_f32(outptr0, _sum00);
368                 vst1q_f32(outptr0 + 4, _sum01);
369                 vst1q_f32(outptr0 + 8, _sum02);
370                 vst1q_f32(outptr0 + 12, _sum03);
371                 vst1q_f32(outptr1, _sum10);
372                 vst1q_f32(outptr1 + 4, _sum11);
373                 vst1q_f32(outptr1 + 8, _sum12);
374                 vst1q_f32(outptr1 + 12, _sum13);
375 
376                 r0 += 16;
377                 r1 += 16;
378                 r2 += 16;
379                 r3 += 16;
380                 r4 += 16;
381                 r5 += 16;
382                 outptr0 += 16;
383                 outptr1 += 16;
384             }
385             for (; j + 1 < outw; j += 2)
386             {
387                 float32x4_t _sum00 = _bias0;
388                 float32x4_t _sum01 = _bias0;
389                 float32x4_t _sum10 = _bias0;
390                 float32x4_t _sum11 = _bias0;
391 
392                 float32x4_t _r00 = vld1q_f32(r0);
393                 float32x4_t _r01 = vld1q_f32(r0 + 4);
394                 float32x4_t _r02 = vld1q_f32(r0 + 8);
395                 float32x4_t _r03 = vld1q_f32(r0 + 12);
396                 float32x4_t _r04 = vld1q_f32(r0 + 16);
397                 float32x4_t _r05 = vld1q_f32(r0 + 20);
398 
399                 float32x4_t _k00 = vld1q_f32(k0);
400                 float32x4_t _k01 = vld1q_f32(k0 + 4);
401                 float32x4_t _k02 = vld1q_f32(k0 + 8);
402                 float32x4_t _k03 = vld1q_f32(k0 + 12);
403                 float32x4_t _k04 = vld1q_f32(k0 + 16);
404                 k0 += 20;
405 
406                 _sum00 = vmlaq_f32(_sum00, _k00, _r00);
407                 _sum00 = vmlaq_f32(_sum00, _k01, _r01);
408                 _sum00 = vmlaq_f32(_sum00, _k02, _r02);
409                 _sum00 = vmlaq_f32(_sum00, _k03, _r03);
410                 _sum00 = vmlaq_f32(_sum00, _k04, _r04);
411                 _sum01 = vmlaq_f32(_sum01, _k00, _r01);
412                 _sum01 = vmlaq_f32(_sum01, _k01, _r02);
413                 _sum01 = vmlaq_f32(_sum01, _k02, _r03);
414                 _sum01 = vmlaq_f32(_sum01, _k03, _r04);
415                 _sum01 = vmlaq_f32(_sum01, _k04, _r05);
416 
417                 float32x4_t _r10 = vld1q_f32(r1);
418                 float32x4_t _r11 = vld1q_f32(r1 + 4);
419                 float32x4_t _r12 = vld1q_f32(r1 + 8);
420                 float32x4_t _r13 = vld1q_f32(r1 + 12);
421                 float32x4_t _r14 = vld1q_f32(r1 + 16);
422                 float32x4_t _r15 = vld1q_f32(r1 + 20);
423 
424                 float32x4_t _k10 = vld1q_f32(k0);
425                 float32x4_t _k11 = vld1q_f32(k0 + 4);
426                 float32x4_t _k12 = vld1q_f32(k0 + 8);
427                 float32x4_t _k13 = vld1q_f32(k0 + 12);
428                 float32x4_t _k14 = vld1q_f32(k0 + 16);
429                 k0 += 20;
430 
431                 _sum10 = vmlaq_f32(_sum10, _k00, _r10);
432                 _sum10 = vmlaq_f32(_sum10, _k01, _r11);
433                 _sum10 = vmlaq_f32(_sum10, _k02, _r12);
434                 _sum10 = vmlaq_f32(_sum10, _k03, _r13);
435                 _sum10 = vmlaq_f32(_sum10, _k04, _r14);
436                 _sum11 = vmlaq_f32(_sum11, _k00, _r11);
437                 _sum11 = vmlaq_f32(_sum11, _k01, _r12);
438                 _sum11 = vmlaq_f32(_sum11, _k02, _r13);
439                 _sum11 = vmlaq_f32(_sum11, _k03, _r14);
440                 _sum11 = vmlaq_f32(_sum11, _k04, _r15);
441 
442                 _sum00 = vmlaq_f32(_sum00, _k10, _r10);
443                 _sum00 = vmlaq_f32(_sum00, _k11, _r11);
444                 _sum00 = vmlaq_f32(_sum00, _k12, _r12);
445                 _sum00 = vmlaq_f32(_sum00, _k13, _r13);
446                 _sum00 = vmlaq_f32(_sum00, _k14, _r14);
447                 _sum01 = vmlaq_f32(_sum01, _k10, _r11);
448                 _sum01 = vmlaq_f32(_sum01, _k11, _r12);
449                 _sum01 = vmlaq_f32(_sum01, _k12, _r13);
450                 _sum01 = vmlaq_f32(_sum01, _k13, _r14);
451                 _sum01 = vmlaq_f32(_sum01, _k14, _r15);
452 
453                 float32x4_t _r20 = vld1q_f32(r2);
454                 float32x4_t _r21 = vld1q_f32(r2 + 4);
455                 float32x4_t _r22 = vld1q_f32(r2 + 8);
456                 float32x4_t _r23 = vld1q_f32(r2 + 12);
457                 float32x4_t _r24 = vld1q_f32(r2 + 16);
458                 float32x4_t _r25 = vld1q_f32(r2 + 20);
459 
460                 float32x4_t _k20 = vld1q_f32(k0);
461                 float32x4_t _k21 = vld1q_f32(k0 + 4);
462                 float32x4_t _k22 = vld1q_f32(k0 + 8);
463                 float32x4_t _k23 = vld1q_f32(k0 + 12);
464                 float32x4_t _k24 = vld1q_f32(k0 + 16);
465                 k0 += 20;
466 
467                 _sum10 = vmlaq_f32(_sum10, _k10, _r20);
468                 _sum10 = vmlaq_f32(_sum10, _k11, _r21);
469                 _sum10 = vmlaq_f32(_sum10, _k12, _r22);
470                 _sum10 = vmlaq_f32(_sum10, _k13, _r23);
471                 _sum10 = vmlaq_f32(_sum10, _k14, _r24);
472                 _sum11 = vmlaq_f32(_sum11, _k10, _r21);
473                 _sum11 = vmlaq_f32(_sum11, _k11, _r22);
474                 _sum11 = vmlaq_f32(_sum11, _k12, _r23);
475                 _sum11 = vmlaq_f32(_sum11, _k13, _r24);
476                 _sum11 = vmlaq_f32(_sum11, _k14, _r25);
477 
478                 _sum00 = vmlaq_f32(_sum00, _k20, _r20);
479                 _sum00 = vmlaq_f32(_sum00, _k21, _r21);
480                 _sum00 = vmlaq_f32(_sum00, _k22, _r22);
481                 _sum00 = vmlaq_f32(_sum00, _k23, _r23);
482                 _sum00 = vmlaq_f32(_sum00, _k24, _r24);
483                 _sum01 = vmlaq_f32(_sum01, _k20, _r21);
484                 _sum01 = vmlaq_f32(_sum01, _k21, _r22);
485                 _sum01 = vmlaq_f32(_sum01, _k22, _r23);
486                 _sum01 = vmlaq_f32(_sum01, _k23, _r24);
487                 _sum01 = vmlaq_f32(_sum01, _k24, _r25);
488 
489                 float32x4_t _r30 = vld1q_f32(r3);
490                 float32x4_t _r31 = vld1q_f32(r3 + 4);
491                 float32x4_t _r32 = vld1q_f32(r3 + 8);
492                 float32x4_t _r33 = vld1q_f32(r3 + 12);
493                 float32x4_t _r34 = vld1q_f32(r3 + 16);
494                 float32x4_t _r35 = vld1q_f32(r3 + 20);
495 
496                 float32x4_t _k30 = vld1q_f32(k0);
497                 float32x4_t _k31 = vld1q_f32(k0 + 4);
498                 float32x4_t _k32 = vld1q_f32(k0 + 8);
499                 float32x4_t _k33 = vld1q_f32(k0 + 12);
500                 float32x4_t _k34 = vld1q_f32(k0 + 16);
501                 k0 += 20;
502 
503                 _sum10 = vmlaq_f32(_sum10, _k20, _r30);
504                 _sum10 = vmlaq_f32(_sum10, _k21, _r31);
505                 _sum10 = vmlaq_f32(_sum10, _k22, _r32);
506                 _sum10 = vmlaq_f32(_sum10, _k23, _r33);
507                 _sum10 = vmlaq_f32(_sum10, _k24, _r34);
508                 _sum11 = vmlaq_f32(_sum11, _k20, _r31);
509                 _sum11 = vmlaq_f32(_sum11, _k21, _r32);
510                 _sum11 = vmlaq_f32(_sum11, _k22, _r33);
511                 _sum11 = vmlaq_f32(_sum11, _k23, _r34);
512                 _sum11 = vmlaq_f32(_sum11, _k24, _r35);
513 
514                 _sum00 = vmlaq_f32(_sum00, _k30, _r30);
515                 _sum00 = vmlaq_f32(_sum00, _k31, _r31);
516                 _sum00 = vmlaq_f32(_sum00, _k32, _r32);
517                 _sum00 = vmlaq_f32(_sum00, _k33, _r33);
518                 _sum00 = vmlaq_f32(_sum00, _k34, _r34);
519                 _sum01 = vmlaq_f32(_sum01, _k30, _r31);
520                 _sum01 = vmlaq_f32(_sum01, _k31, _r32);
521                 _sum01 = vmlaq_f32(_sum01, _k32, _r33);
522                 _sum01 = vmlaq_f32(_sum01, _k33, _r34);
523                 _sum01 = vmlaq_f32(_sum01, _k34, _r35);
524 
525                 float32x4_t _r40 = vld1q_f32(r4);
526                 float32x4_t _r41 = vld1q_f32(r4 + 4);
527                 float32x4_t _r42 = vld1q_f32(r4 + 8);
528                 float32x4_t _r43 = vld1q_f32(r4 + 12);
529                 float32x4_t _r44 = vld1q_f32(r4 + 16);
530                 float32x4_t _r45 = vld1q_f32(r4 + 20);
531 
532                 float32x4_t _k40 = vld1q_f32(k0);
533                 float32x4_t _k41 = vld1q_f32(k0 + 4);
534                 float32x4_t _k42 = vld1q_f32(k0 + 8);
535                 float32x4_t _k43 = vld1q_f32(k0 + 12);
536                 float32x4_t _k44 = vld1q_f32(k0 + 16);
537                 k0 -= 80;
538 
539                 _sum10 = vmlaq_f32(_sum10, _k30, _r40);
540                 _sum10 = vmlaq_f32(_sum10, _k31, _r41);
541                 _sum10 = vmlaq_f32(_sum10, _k32, _r42);
542                 _sum10 = vmlaq_f32(_sum10, _k33, _r43);
543                 _sum10 = vmlaq_f32(_sum10, _k34, _r44);
544                 _sum11 = vmlaq_f32(_sum11, _k30, _r41);
545                 _sum11 = vmlaq_f32(_sum11, _k31, _r42);
546                 _sum11 = vmlaq_f32(_sum11, _k32, _r43);
547                 _sum11 = vmlaq_f32(_sum11, _k33, _r44);
548                 _sum11 = vmlaq_f32(_sum11, _k34, _r45);
549 
550                 _sum00 = vmlaq_f32(_sum00, _k40, _r40);
551                 _sum00 = vmlaq_f32(_sum00, _k41, _r41);
552                 _sum00 = vmlaq_f32(_sum00, _k42, _r42);
553                 _sum00 = vmlaq_f32(_sum00, _k43, _r43);
554                 _sum00 = vmlaq_f32(_sum00, _k44, _r44);
555                 _sum01 = vmlaq_f32(_sum01, _k40, _r41);
556                 _sum01 = vmlaq_f32(_sum01, _k41, _r42);
557                 _sum01 = vmlaq_f32(_sum01, _k42, _r43);
558                 _sum01 = vmlaq_f32(_sum01, _k43, _r44);
559                 _sum01 = vmlaq_f32(_sum01, _k44, _r45);
560 
561                 float32x4_t _r50 = vld1q_f32(r5);
562                 float32x4_t _r51 = vld1q_f32(r5 + 4);
563                 float32x4_t _r52 = vld1q_f32(r5 + 8);
564                 float32x4_t _r53 = vld1q_f32(r5 + 12);
565                 float32x4_t _r54 = vld1q_f32(r5 + 16);
566                 float32x4_t _r55 = vld1q_f32(r5 + 20);
567 
568                 _sum10 = vmlaq_f32(_sum10, _k40, _r50);
569                 _sum10 = vmlaq_f32(_sum10, _k41, _r51);
570                 _sum10 = vmlaq_f32(_sum10, _k42, _r52);
571                 _sum10 = vmlaq_f32(_sum10, _k43, _r53);
572                 _sum10 = vmlaq_f32(_sum10, _k44, _r54);
573                 _sum11 = vmlaq_f32(_sum11, _k40, _r51);
574                 _sum11 = vmlaq_f32(_sum11, _k41, _r52);
575                 _sum11 = vmlaq_f32(_sum11, _k42, _r53);
576                 _sum11 = vmlaq_f32(_sum11, _k43, _r54);
577                 _sum11 = vmlaq_f32(_sum11, _k44, _r55);
578 
579                 vst1q_f32(outptr0, _sum00);
580                 vst1q_f32(outptr0 + 4, _sum01);
581                 vst1q_f32(outptr1, _sum10);
582                 vst1q_f32(outptr1 + 4, _sum11);
583 
584                 r0 += 8;
585                 r1 += 8;
586                 r2 += 8;
587                 r3 += 8;
588                 r4 += 8;
589                 r5 += 8;
590                 outptr0 += 8;
591                 outptr1 += 8;
592             }
593             for (; j < outw; j++)
594             {
595                 float32x4_t _sum0 = _bias0;
596                 float32x4_t _sum1 = _bias0;
597 
598                 float32x4_t _r00 = vld1q_f32(r0);
599                 float32x4_t _r01 = vld1q_f32(r0 + 4);
600                 float32x4_t _r02 = vld1q_f32(r0 + 8);
601                 float32x4_t _r03 = vld1q_f32(r0 + 12);
602                 float32x4_t _r04 = vld1q_f32(r0 + 16);
603 
604                 float32x4_t _k00 = vld1q_f32(k0);
605                 float32x4_t _k01 = vld1q_f32(k0 + 4);
606                 float32x4_t _k02 = vld1q_f32(k0 + 8);
607                 float32x4_t _k03 = vld1q_f32(k0 + 12);
608                 float32x4_t _k04 = vld1q_f32(k0 + 16);
609                 k0 += 20;
610 
611                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
612                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
613                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
614                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
615                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
616 
617                 float32x4_t _r10 = vld1q_f32(r1);
618                 float32x4_t _r11 = vld1q_f32(r1 + 4);
619                 float32x4_t _r12 = vld1q_f32(r1 + 8);
620                 float32x4_t _r13 = vld1q_f32(r1 + 12);
621                 float32x4_t _r14 = vld1q_f32(r1 + 16);
622 
623                 float32x4_t _k10 = vld1q_f32(k0);
624                 float32x4_t _k11 = vld1q_f32(k0 + 4);
625                 float32x4_t _k12 = vld1q_f32(k0 + 8);
626                 float32x4_t _k13 = vld1q_f32(k0 + 12);
627                 float32x4_t _k14 = vld1q_f32(k0 + 16);
628                 k0 += 20;
629 
630                 _sum1 = vmlaq_f32(_sum1, _k00, _r10);
631                 _sum1 = vmlaq_f32(_sum1, _k01, _r11);
632                 _sum1 = vmlaq_f32(_sum1, _k02, _r12);
633                 _sum1 = vmlaq_f32(_sum1, _k03, _r13);
634                 _sum1 = vmlaq_f32(_sum1, _k04, _r14);
635 
636                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
637                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
638                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
639                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
640                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
641 
642                 float32x4_t _r20 = vld1q_f32(r2);
643                 float32x4_t _r21 = vld1q_f32(r2 + 4);
644                 float32x4_t _r22 = vld1q_f32(r2 + 8);
645                 float32x4_t _r23 = vld1q_f32(r2 + 12);
646                 float32x4_t _r24 = vld1q_f32(r2 + 16);
647 
648                 float32x4_t _k20 = vld1q_f32(k0);
649                 float32x4_t _k21 = vld1q_f32(k0 + 4);
650                 float32x4_t _k22 = vld1q_f32(k0 + 8);
651                 float32x4_t _k23 = vld1q_f32(k0 + 12);
652                 float32x4_t _k24 = vld1q_f32(k0 + 16);
653                 k0 += 20;
654 
655                 _sum1 = vmlaq_f32(_sum1, _k10, _r20);
656                 _sum1 = vmlaq_f32(_sum1, _k11, _r21);
657                 _sum1 = vmlaq_f32(_sum1, _k12, _r22);
658                 _sum1 = vmlaq_f32(_sum1, _k13, _r23);
659                 _sum1 = vmlaq_f32(_sum1, _k14, _r24);
660 
661                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
662                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
663                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
664                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
665                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
666 
667                 float32x4_t _r30 = vld1q_f32(r3);
668                 float32x4_t _r31 = vld1q_f32(r3 + 4);
669                 float32x4_t _r32 = vld1q_f32(r3 + 8);
670                 float32x4_t _r33 = vld1q_f32(r3 + 12);
671                 float32x4_t _r34 = vld1q_f32(r3 + 16);
672 
673                 float32x4_t _k30 = vld1q_f32(k0);
674                 float32x4_t _k31 = vld1q_f32(k0 + 4);
675                 float32x4_t _k32 = vld1q_f32(k0 + 8);
676                 float32x4_t _k33 = vld1q_f32(k0 + 12);
677                 float32x4_t _k34 = vld1q_f32(k0 + 16);
678                 k0 += 20;
679 
680                 _sum1 = vmlaq_f32(_sum1, _k20, _r30);
681                 _sum1 = vmlaq_f32(_sum1, _k21, _r31);
682                 _sum1 = vmlaq_f32(_sum1, _k22, _r32);
683                 _sum1 = vmlaq_f32(_sum1, _k23, _r33);
684                 _sum1 = vmlaq_f32(_sum1, _k24, _r34);
685 
686                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
687                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
688                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
689                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
690                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
691 
692                 float32x4_t _r40 = vld1q_f32(r4);
693                 float32x4_t _r41 = vld1q_f32(r4 + 4);
694                 float32x4_t _r42 = vld1q_f32(r4 + 8);
695                 float32x4_t _r43 = vld1q_f32(r4 + 12);
696                 float32x4_t _r44 = vld1q_f32(r4 + 16);
697 
698                 float32x4_t _k40 = vld1q_f32(k0);
699                 float32x4_t _k41 = vld1q_f32(k0 + 4);
700                 float32x4_t _k42 = vld1q_f32(k0 + 8);
701                 float32x4_t _k43 = vld1q_f32(k0 + 12);
702                 float32x4_t _k44 = vld1q_f32(k0 + 16);
703                 k0 -= 80;
704 
705                 _sum1 = vmlaq_f32(_sum1, _k30, _r40);
706                 _sum1 = vmlaq_f32(_sum1, _k31, _r41);
707                 _sum1 = vmlaq_f32(_sum1, _k32, _r42);
708                 _sum1 = vmlaq_f32(_sum1, _k33, _r43);
709                 _sum1 = vmlaq_f32(_sum1, _k34, _r44);
710 
711                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
712                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
713                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
714                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
715                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
716 
717                 float32x4_t _r50 = vld1q_f32(r5);
718                 float32x4_t _r51 = vld1q_f32(r5 + 4);
719                 float32x4_t _r52 = vld1q_f32(r5 + 8);
720                 float32x4_t _r53 = vld1q_f32(r5 + 12);
721                 float32x4_t _r54 = vld1q_f32(r5 + 16);
722 
723                 _sum1 = vmlaq_f32(_sum1, _k40, _r50);
724                 _sum1 = vmlaq_f32(_sum1, _k41, _r51);
725                 _sum1 = vmlaq_f32(_sum1, _k42, _r52);
726                 _sum1 = vmlaq_f32(_sum1, _k43, _r53);
727                 _sum1 = vmlaq_f32(_sum1, _k44, _r54);
728 
729                 vst1q_f32(outptr0, _sum0);
730                 vst1q_f32(outptr1, _sum1);
731 
732                 r0 += 4;
733                 r1 += 4;
734                 r2 += 4;
735                 r3 += 4;
736                 r4 += 4;
737                 r5 += 4;
738                 outptr0 += 4;
739                 outptr1 += 4;
740             }
741 
742             r0 += 4 * 4 + w * 4;
743             r1 += 4 * 4 + w * 4;
744             r2 += 4 * 4 + w * 4;
745             r3 += 4 * 4 + w * 4;
746             r4 += 4 * 4 + w * 4;
747             r5 += 4 * 4 + w * 4;
748 
749             outptr0 += outw * 4;
750             outptr1 += outw * 4;
751         }
752 #endif // __aarch64__
753         for (; i < outh; i++)
754         {
755             int j = 0;
756 
757             for (; j + 3 < outw; j += 4)
758             {
759                 float32x4_t _sum0 = _bias0;
760                 float32x4_t _sum1 = _bias0;
761                 float32x4_t _sum2 = _bias0;
762                 float32x4_t _sum3 = _bias0;
763 
764                 float32x4_t _r00 = vld1q_f32(r0);
765                 float32x4_t _r01 = vld1q_f32(r0 + 4);
766                 float32x4_t _r02 = vld1q_f32(r0 + 8);
767                 float32x4_t _r03 = vld1q_f32(r0 + 12);
768                 float32x4_t _r04 = vld1q_f32(r0 + 16);
769                 float32x4_t _r05 = vld1q_f32(r0 + 20);
770                 float32x4_t _r06 = vld1q_f32(r0 + 24);
771                 float32x4_t _r07 = vld1q_f32(r0 + 28);
772 
773                 float32x4_t _k00 = vld1q_f32(k0);
774                 float32x4_t _k01 = vld1q_f32(k0 + 4);
775                 float32x4_t _k02 = vld1q_f32(k0 + 8);
776                 float32x4_t _k03 = vld1q_f32(k0 + 12);
777                 float32x4_t _k04 = vld1q_f32(k0 + 16);
778                 k0 += 20;
779 
780                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
781                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
782                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
783                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
784                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
785                 _sum1 = vmlaq_f32(_sum1, _k00, _r01);
786                 _sum1 = vmlaq_f32(_sum1, _k01, _r02);
787                 _sum1 = vmlaq_f32(_sum1, _k02, _r03);
788                 _sum1 = vmlaq_f32(_sum1, _k03, _r04);
789                 _sum1 = vmlaq_f32(_sum1, _k04, _r05);
790                 _sum2 = vmlaq_f32(_sum2, _k00, _r02);
791                 _sum2 = vmlaq_f32(_sum2, _k01, _r03);
792                 _sum2 = vmlaq_f32(_sum2, _k02, _r04);
793                 _sum2 = vmlaq_f32(_sum2, _k03, _r05);
794                 _sum2 = vmlaq_f32(_sum2, _k04, _r06);
795                 _sum3 = vmlaq_f32(_sum3, _k00, _r03);
796                 _sum3 = vmlaq_f32(_sum3, _k01, _r04);
797                 _sum3 = vmlaq_f32(_sum3, _k02, _r05);
798                 _sum3 = vmlaq_f32(_sum3, _k03, _r06);
799                 _sum3 = vmlaq_f32(_sum3, _k04, _r07);
800 
801                 float32x4_t _r10 = vld1q_f32(r1);
802                 float32x4_t _r11 = vld1q_f32(r1 + 4);
803                 float32x4_t _r12 = vld1q_f32(r1 + 8);
804                 float32x4_t _r13 = vld1q_f32(r1 + 12);
805                 float32x4_t _r14 = vld1q_f32(r1 + 16);
806                 float32x4_t _r15 = vld1q_f32(r1 + 20);
807                 float32x4_t _r16 = vld1q_f32(r1 + 24);
808                 float32x4_t _r17 = vld1q_f32(r1 + 28);
809 
810                 float32x4_t _k10 = vld1q_f32(k0);
811                 float32x4_t _k11 = vld1q_f32(k0 + 4);
812                 float32x4_t _k12 = vld1q_f32(k0 + 8);
813                 float32x4_t _k13 = vld1q_f32(k0 + 12);
814                 float32x4_t _k14 = vld1q_f32(k0 + 16);
815                 k0 += 20;
816 
817                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
818                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
819                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
820                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
821                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
822                 _sum1 = vmlaq_f32(_sum1, _k10, _r11);
823                 _sum1 = vmlaq_f32(_sum1, _k11, _r12);
824                 _sum1 = vmlaq_f32(_sum1, _k12, _r13);
825                 _sum1 = vmlaq_f32(_sum1, _k13, _r14);
826                 _sum1 = vmlaq_f32(_sum1, _k14, _r15);
827                 _sum2 = vmlaq_f32(_sum2, _k10, _r12);
828                 _sum2 = vmlaq_f32(_sum2, _k11, _r13);
829                 _sum2 = vmlaq_f32(_sum2, _k12, _r14);
830                 _sum2 = vmlaq_f32(_sum2, _k13, _r15);
831                 _sum2 = vmlaq_f32(_sum2, _k14, _r16);
832                 _sum3 = vmlaq_f32(_sum3, _k10, _r13);
833                 _sum3 = vmlaq_f32(_sum3, _k11, _r14);
834                 _sum3 = vmlaq_f32(_sum3, _k12, _r15);
835                 _sum3 = vmlaq_f32(_sum3, _k13, _r16);
836                 _sum3 = vmlaq_f32(_sum3, _k14, _r17);
837 
838                 float32x4_t _r20 = vld1q_f32(r2);
839                 float32x4_t _r21 = vld1q_f32(r2 + 4);
840                 float32x4_t _r22 = vld1q_f32(r2 + 8);
841                 float32x4_t _r23 = vld1q_f32(r2 + 12);
842                 float32x4_t _r24 = vld1q_f32(r2 + 16);
843                 float32x4_t _r25 = vld1q_f32(r2 + 20);
844                 float32x4_t _r26 = vld1q_f32(r2 + 24);
845                 float32x4_t _r27 = vld1q_f32(r2 + 28);
846 
847                 float32x4_t _k20 = vld1q_f32(k0);
848                 float32x4_t _k21 = vld1q_f32(k0 + 4);
849                 float32x4_t _k22 = vld1q_f32(k0 + 8);
850                 float32x4_t _k23 = vld1q_f32(k0 + 12);
851                 float32x4_t _k24 = vld1q_f32(k0 + 16);
852                 k0 += 20;
853 
854                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
855                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
856                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
857                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
858                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
859                 _sum1 = vmlaq_f32(_sum1, _k20, _r21);
860                 _sum1 = vmlaq_f32(_sum1, _k21, _r22);
861                 _sum1 = vmlaq_f32(_sum1, _k22, _r23);
862                 _sum1 = vmlaq_f32(_sum1, _k23, _r24);
863                 _sum1 = vmlaq_f32(_sum1, _k24, _r25);
864                 _sum2 = vmlaq_f32(_sum2, _k20, _r22);
865                 _sum2 = vmlaq_f32(_sum2, _k21, _r23);
866                 _sum2 = vmlaq_f32(_sum2, _k22, _r24);
867                 _sum2 = vmlaq_f32(_sum2, _k23, _r25);
868                 _sum2 = vmlaq_f32(_sum2, _k24, _r26);
869                 _sum3 = vmlaq_f32(_sum3, _k20, _r23);
870                 _sum3 = vmlaq_f32(_sum3, _k21, _r24);
871                 _sum3 = vmlaq_f32(_sum3, _k22, _r25);
872                 _sum3 = vmlaq_f32(_sum3, _k23, _r26);
873                 _sum3 = vmlaq_f32(_sum3, _k24, _r27);
874 
875                 float32x4_t _r30 = vld1q_f32(r3);
876                 float32x4_t _r31 = vld1q_f32(r3 + 4);
877                 float32x4_t _r32 = vld1q_f32(r3 + 8);
878                 float32x4_t _r33 = vld1q_f32(r3 + 12);
879                 float32x4_t _r34 = vld1q_f32(r3 + 16);
880                 float32x4_t _r35 = vld1q_f32(r3 + 20);
881                 float32x4_t _r36 = vld1q_f32(r3 + 24);
882                 float32x4_t _r37 = vld1q_f32(r3 + 28);
883 
884                 float32x4_t _k30 = vld1q_f32(k0);
885                 float32x4_t _k31 = vld1q_f32(k0 + 4);
886                 float32x4_t _k32 = vld1q_f32(k0 + 8);
887                 float32x4_t _k33 = vld1q_f32(k0 + 12);
888                 float32x4_t _k34 = vld1q_f32(k0 + 16);
889                 k0 += 20;
890 
891                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
892                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
893                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
894                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
895                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
896                 _sum1 = vmlaq_f32(_sum1, _k30, _r31);
897                 _sum1 = vmlaq_f32(_sum1, _k31, _r32);
898                 _sum1 = vmlaq_f32(_sum1, _k32, _r33);
899                 _sum1 = vmlaq_f32(_sum1, _k33, _r34);
900                 _sum1 = vmlaq_f32(_sum1, _k34, _r35);
901                 _sum2 = vmlaq_f32(_sum2, _k30, _r32);
902                 _sum2 = vmlaq_f32(_sum2, _k31, _r33);
903                 _sum2 = vmlaq_f32(_sum2, _k32, _r34);
904                 _sum2 = vmlaq_f32(_sum2, _k33, _r35);
905                 _sum2 = vmlaq_f32(_sum2, _k34, _r36);
906                 _sum3 = vmlaq_f32(_sum3, _k30, _r33);
907                 _sum3 = vmlaq_f32(_sum3, _k31, _r34);
908                 _sum3 = vmlaq_f32(_sum3, _k32, _r35);
909                 _sum3 = vmlaq_f32(_sum3, _k33, _r36);
910                 _sum3 = vmlaq_f32(_sum3, _k34, _r37);
911 
912                 float32x4_t _r40 = vld1q_f32(r4);
913                 float32x4_t _r41 = vld1q_f32(r4 + 4);
914                 float32x4_t _r42 = vld1q_f32(r4 + 8);
915                 float32x4_t _r43 = vld1q_f32(r4 + 12);
916                 float32x4_t _r44 = vld1q_f32(r4 + 16);
917                 float32x4_t _r45 = vld1q_f32(r4 + 20);
918                 float32x4_t _r46 = vld1q_f32(r4 + 24);
919                 float32x4_t _r47 = vld1q_f32(r4 + 28);
920 
921                 float32x4_t _k40 = vld1q_f32(k0);
922                 float32x4_t _k41 = vld1q_f32(k0 + 4);
923                 float32x4_t _k42 = vld1q_f32(k0 + 8);
924                 float32x4_t _k43 = vld1q_f32(k0 + 12);
925                 float32x4_t _k44 = vld1q_f32(k0 + 16);
926                 k0 -= 80;
927 
928                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
929                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
930                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
931                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
932                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
933                 _sum1 = vmlaq_f32(_sum1, _k40, _r41);
934                 _sum1 = vmlaq_f32(_sum1, _k41, _r42);
935                 _sum1 = vmlaq_f32(_sum1, _k42, _r43);
936                 _sum1 = vmlaq_f32(_sum1, _k43, _r44);
937                 _sum1 = vmlaq_f32(_sum1, _k44, _r45);
938                 _sum2 = vmlaq_f32(_sum2, _k40, _r42);
939                 _sum2 = vmlaq_f32(_sum2, _k41, _r43);
940                 _sum2 = vmlaq_f32(_sum2, _k42, _r44);
941                 _sum2 = vmlaq_f32(_sum2, _k43, _r45);
942                 _sum2 = vmlaq_f32(_sum2, _k44, _r46);
943                 _sum3 = vmlaq_f32(_sum3, _k40, _r43);
944                 _sum3 = vmlaq_f32(_sum3, _k41, _r44);
945                 _sum3 = vmlaq_f32(_sum3, _k42, _r45);
946                 _sum3 = vmlaq_f32(_sum3, _k43, _r46);
947                 _sum3 = vmlaq_f32(_sum3, _k44, _r47);
948 
949                 vst1q_f32(outptr0, _sum0);
950                 vst1q_f32(outptr0 + 4, _sum1);
951                 vst1q_f32(outptr0 + 8, _sum2);
952                 vst1q_f32(outptr0 + 12, _sum3);
953 
954                 r0 += 16;
955                 r1 += 16;
956                 r2 += 16;
957                 r3 += 16;
958                 r4 += 16;
959                 outptr0 += 16;
960             }
961             for (; j + 1 < outw; j += 2)
962             {
963                 float32x4_t _sum0 = _bias0;
964                 float32x4_t _sum1 = _bias0;
965 
966                 float32x4_t _r00 = vld1q_f32(r0);
967                 float32x4_t _r01 = vld1q_f32(r0 + 4);
968                 float32x4_t _r02 = vld1q_f32(r0 + 8);
969                 float32x4_t _r03 = vld1q_f32(r0 + 12);
970                 float32x4_t _r04 = vld1q_f32(r0 + 16);
971                 float32x4_t _r05 = vld1q_f32(r0 + 20);
972 
973                 float32x4_t _k00 = vld1q_f32(k0);
974                 float32x4_t _k01 = vld1q_f32(k0 + 4);
975                 float32x4_t _k02 = vld1q_f32(k0 + 8);
976                 float32x4_t _k03 = vld1q_f32(k0 + 12);
977                 float32x4_t _k04 = vld1q_f32(k0 + 16);
978                 k0 += 20;
979 
980                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
981                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
982                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
983                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
984                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
985                 _sum1 = vmlaq_f32(_sum1, _k00, _r01);
986                 _sum1 = vmlaq_f32(_sum1, _k01, _r02);
987                 _sum1 = vmlaq_f32(_sum1, _k02, _r03);
988                 _sum1 = vmlaq_f32(_sum1, _k03, _r04);
989                 _sum1 = vmlaq_f32(_sum1, _k04, _r05);
990 
991                 float32x4_t _r10 = vld1q_f32(r1);
992                 float32x4_t _r11 = vld1q_f32(r1 + 4);
993                 float32x4_t _r12 = vld1q_f32(r1 + 8);
994                 float32x4_t _r13 = vld1q_f32(r1 + 12);
995                 float32x4_t _r14 = vld1q_f32(r1 + 16);
996                 float32x4_t _r15 = vld1q_f32(r1 + 20);
997 
998                 float32x4_t _k10 = vld1q_f32(k0);
999                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1000                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1001                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1002                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1003                 k0 += 20;
1004 
1005                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1006                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1007                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1008                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1009                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1010                 _sum1 = vmlaq_f32(_sum1, _k10, _r11);
1011                 _sum1 = vmlaq_f32(_sum1, _k11, _r12);
1012                 _sum1 = vmlaq_f32(_sum1, _k12, _r13);
1013                 _sum1 = vmlaq_f32(_sum1, _k13, _r14);
1014                 _sum1 = vmlaq_f32(_sum1, _k14, _r15);
1015 
1016                 float32x4_t _r20 = vld1q_f32(r2);
1017                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1018                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1019                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1020                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1021                 float32x4_t _r25 = vld1q_f32(r2 + 20);
1022 
1023                 float32x4_t _k20 = vld1q_f32(k0);
1024                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1025                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1026                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1027                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1028                 k0 += 20;
1029 
1030                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1031                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1032                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1033                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1034                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1035                 _sum1 = vmlaq_f32(_sum1, _k20, _r21);
1036                 _sum1 = vmlaq_f32(_sum1, _k21, _r22);
1037                 _sum1 = vmlaq_f32(_sum1, _k22, _r23);
1038                 _sum1 = vmlaq_f32(_sum1, _k23, _r24);
1039                 _sum1 = vmlaq_f32(_sum1, _k24, _r25);
1040 
1041                 float32x4_t _r30 = vld1q_f32(r3);
1042                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1043                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1044                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1045                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1046                 float32x4_t _r35 = vld1q_f32(r3 + 20);
1047 
1048                 float32x4_t _k30 = vld1q_f32(k0);
1049                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1050                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1051                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1052                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1053                 k0 += 20;
1054 
1055                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1056                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1057                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1058                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1059                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1060                 _sum1 = vmlaq_f32(_sum1, _k30, _r31);
1061                 _sum1 = vmlaq_f32(_sum1, _k31, _r32);
1062                 _sum1 = vmlaq_f32(_sum1, _k32, _r33);
1063                 _sum1 = vmlaq_f32(_sum1, _k33, _r34);
1064                 _sum1 = vmlaq_f32(_sum1, _k34, _r35);
1065 
1066                 float32x4_t _r40 = vld1q_f32(r4);
1067                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1068                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1069                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1070                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1071                 float32x4_t _r45 = vld1q_f32(r4 + 20);
1072 
1073                 float32x4_t _k40 = vld1q_f32(k0);
1074                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1075                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1076                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1077                 float32x4_t _k44 = vld1q_f32(k0 + 16);
1078                 k0 -= 80;
1079 
1080                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1081                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1082                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1083                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1084                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1085                 _sum1 = vmlaq_f32(_sum1, _k40, _r41);
1086                 _sum1 = vmlaq_f32(_sum1, _k41, _r42);
1087                 _sum1 = vmlaq_f32(_sum1, _k42, _r43);
1088                 _sum1 = vmlaq_f32(_sum1, _k43, _r44);
1089                 _sum1 = vmlaq_f32(_sum1, _k44, _r45);
1090 
1091                 vst1q_f32(outptr0, _sum0);
1092                 vst1q_f32(outptr0 + 4, _sum1);
1093 
1094                 r0 += 8;
1095                 r1 += 8;
1096                 r2 += 8;
1097                 r3 += 8;
1098                 r4 += 8;
1099                 outptr0 += 8;
1100             }
1101             for (; j < outw; j++)
1102             {
1103                 float32x4_t _sum0 = _bias0;
1104 
1105                 float32x4_t _r00 = vld1q_f32(r0);
1106                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1107                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1108                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1109                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1110 
1111                 float32x4_t _k00 = vld1q_f32(k0);
1112                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1113                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1114                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1115                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1116                 k0 += 20;
1117 
1118                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1119                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1120                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1121                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1122                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1123 
1124                 float32x4_t _r10 = vld1q_f32(r1);
1125                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1126                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1127                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1128                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1129 
1130                 float32x4_t _k10 = vld1q_f32(k0);
1131                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1132                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1133                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1134                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1135                 k0 += 20;
1136 
1137                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1138                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1139                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1140                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1141                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1142 
1143                 float32x4_t _r20 = vld1q_f32(r2);
1144                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1145                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1146                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1147                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1148 
1149                 float32x4_t _k20 = vld1q_f32(k0);
1150                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1151                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1152                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1153                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1154                 k0 += 20;
1155 
1156                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1157                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1158                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1159                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1160                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1161 
1162                 float32x4_t _r30 = vld1q_f32(r3);
1163                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1164                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1165                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1166                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1167 
1168                 float32x4_t _k30 = vld1q_f32(k0);
1169                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1170                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1171                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1172                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1173                 k0 += 20;
1174 
1175                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1176                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1177                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1178                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1179                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1180 
1181                 float32x4_t _r40 = vld1q_f32(r4);
1182                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1183                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1184                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1185                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1186 
1187                 float32x4_t _k40 = vld1q_f32(k0);
1188                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1189                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1190                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1191                 float32x4_t _k44 = vld1q_f32(k0 + 16);
1192                 k0 -= 80;
1193 
1194                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1195                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1196                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1197                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1198                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1199 
1200                 vst1q_f32(outptr0, _sum0);
1201 
1202                 r0 += 4;
1203                 r1 += 4;
1204                 r2 += 4;
1205                 r3 += 4;
1206                 r4 += 4;
1207                 outptr0 += 4;
1208             }
1209 
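            // step to the next input row: with a 5-wide kernel and stride 1 the row holds
            // outw + 4 pack4 pixels, so skip the 4 border pixels (16 floats) not consumed above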
1210             r0 += 4 * 4;
1211             r1 += 4 * 4;
1212             r2 += 4 * 4;
1213             r3 += 4 * 4;
1214             r4 += 4 * 4;
1215         }
1216     }
1217 }
1218 
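// depthwise 5x5 convolution, stride 2, pack4 layout: every input/output "pixel" holds 4
// interleaved channel values and each group keeps its 25 kernel taps as 100 consecutive floats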
1219 static void convdw5x5s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
1220 {
1221     int w = bottom_blob.w;
1222 
1223     int outw = top_blob.w;
1224     int outh = top_blob.h;
1225 
1226     const int group = bottom_blob.c;
1227 
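    // after outw stride-2 outputs each row pointer sits 2*outw pixels into the row;
    // tailstep (in floats) jumps past the rest of that row and over one more row (vertical stride 2)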
1228     const int tailstep = (w - 2 * outw + w) * 4;
1229 
1230     const float* bias = _bias;
1231 
1232     #pragma omp parallel for num_threads(opt.num_threads)
1233     for (int g = 0; g < group; g++)
1234     {
1235         Mat out = top_blob.channel(g);
1236 
1237         float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + g * 4) : vdupq_n_f32(0.f);
1238 
1239         const float* k0 = kernel.row(g);
1240 
1241         float* outptr0 = out;
1242 
1243         const Mat img0 = bottom_blob.channel(g);
1244 
1245         const float* r0 = img0.row(0);
1246         const float* r1 = img0.row(1);
1247         const float* r2 = img0.row(2);
1248         const float* r3 = img0.row(3);
1249         const float* r4 = img0.row(4);
1250 
1251         int i = 0;
1252 
1253         for (; i < outh; i++)
1254         {
1255             int j = 0;
1256 
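            // main loop: 4 output pixels per iteration; stride 2 means consecutive outputs
            // read input pixels 2 apart, i.e. 8 floats apart in pack4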
1257             for (; j + 3 < outw; j += 4)
1258             {
1259                 float32x4_t _sum0 = _bias0;
1260                 float32x4_t _sum1 = _bias0;
1261                 float32x4_t _sum2 = _bias0;
1262                 float32x4_t _sum3 = _bias0;
1263 
1264                 float32x4_t _r00 = vld1q_f32(r0);
1265                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1266                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1267                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1268                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1269                 float32x4_t _r05 = vld1q_f32(r0 + 20);
1270                 float32x4_t _r06 = vld1q_f32(r0 + 24);
1271                 float32x4_t _r07 = vld1q_f32(r0 + 28);
1272                 float32x4_t _r08 = vld1q_f32(r0 + 32);
1273                 float32x4_t _r09 = vld1q_f32(r0 + 36);
1274                 float32x4_t _r010 = vld1q_f32(r0 + 40);
1275 
1276                 float32x4_t _k00 = vld1q_f32(k0);
1277                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1278                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1279                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1280                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1281                 k0 += 20;
1282 
1283                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1284                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1285                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1286                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1287                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1288                 _sum1 = vmlaq_f32(_sum1, _k00, _r02);
1289                 _sum1 = vmlaq_f32(_sum1, _k01, _r03);
1290                 _sum1 = vmlaq_f32(_sum1, _k02, _r04);
1291                 _sum1 = vmlaq_f32(_sum1, _k03, _r05);
1292                 _sum1 = vmlaq_f32(_sum1, _k04, _r06);
1293                 _sum2 = vmlaq_f32(_sum2, _k00, _r04);
1294                 _sum2 = vmlaq_f32(_sum2, _k01, _r05);
1295                 _sum2 = vmlaq_f32(_sum2, _k02, _r06);
1296                 _sum2 = vmlaq_f32(_sum2, _k03, _r07);
1297                 _sum2 = vmlaq_f32(_sum2, _k04, _r08);
1298                 _sum3 = vmlaq_f32(_sum3, _k00, _r06);
1299                 _sum3 = vmlaq_f32(_sum3, _k01, _r07);
1300                 _sum3 = vmlaq_f32(_sum3, _k02, _r08);
1301                 _sum3 = vmlaq_f32(_sum3, _k03, _r09);
1302                 _sum3 = vmlaq_f32(_sum3, _k04, _r010);
1303 
1304                 float32x4_t _r10 = vld1q_f32(r1);
1305                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1306                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1307                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1308                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1309                 float32x4_t _r15 = vld1q_f32(r1 + 20);
1310                 float32x4_t _r16 = vld1q_f32(r1 + 24);
1311                 float32x4_t _r17 = vld1q_f32(r1 + 28);
1312                 float32x4_t _r18 = vld1q_f32(r1 + 32);
1313                 float32x4_t _r19 = vld1q_f32(r1 + 36);
1314                 float32x4_t _r110 = vld1q_f32(r1 + 40);
1315 
1316                 float32x4_t _k10 = vld1q_f32(k0);
1317                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1318                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1319                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1320                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1321                 k0 += 20;
1322 
1323                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1324                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1325                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1326                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1327                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1328                 _sum1 = vmlaq_f32(_sum1, _k10, _r12);
1329                 _sum1 = vmlaq_f32(_sum1, _k11, _r13);
1330                 _sum1 = vmlaq_f32(_sum1, _k12, _r14);
1331                 _sum1 = vmlaq_f32(_sum1, _k13, _r15);
1332                 _sum1 = vmlaq_f32(_sum1, _k14, _r16);
1333                 _sum2 = vmlaq_f32(_sum2, _k10, _r14);
1334                 _sum2 = vmlaq_f32(_sum2, _k11, _r15);
1335                 _sum2 = vmlaq_f32(_sum2, _k12, _r16);
1336                 _sum2 = vmlaq_f32(_sum2, _k13, _r17);
1337                 _sum2 = vmlaq_f32(_sum2, _k14, _r18);
1338                 _sum3 = vmlaq_f32(_sum3, _k10, _r16);
1339                 _sum3 = vmlaq_f32(_sum3, _k11, _r17);
1340                 _sum3 = vmlaq_f32(_sum3, _k12, _r18);
1341                 _sum3 = vmlaq_f32(_sum3, _k13, _r19);
1342                 _sum3 = vmlaq_f32(_sum3, _k14, _r110);
1343 
1344                 float32x4_t _r20 = vld1q_f32(r2);
1345                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1346                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1347                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1348                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1349                 float32x4_t _r25 = vld1q_f32(r2 + 20);
1350                 float32x4_t _r26 = vld1q_f32(r2 + 24);
1351                 float32x4_t _r27 = vld1q_f32(r2 + 28);
1352                 float32x4_t _r28 = vld1q_f32(r2 + 32);
1353                 float32x4_t _r29 = vld1q_f32(r2 + 36);
1354                 float32x4_t _r210 = vld1q_f32(r2 + 40);
1355 
1356                 float32x4_t _k20 = vld1q_f32(k0);
1357                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1358                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1359                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1360                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1361                 k0 += 20;
1362 
1363                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1364                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1365                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1366                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1367                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1368                 _sum1 = vmlaq_f32(_sum1, _k20, _r22);
1369                 _sum1 = vmlaq_f32(_sum1, _k21, _r23);
1370                 _sum1 = vmlaq_f32(_sum1, _k22, _r24);
1371                 _sum1 = vmlaq_f32(_sum1, _k23, _r25);
1372                 _sum1 = vmlaq_f32(_sum1, _k24, _r26);
1373                 _sum2 = vmlaq_f32(_sum2, _k20, _r24);
1374                 _sum2 = vmlaq_f32(_sum2, _k21, _r25);
1375                 _sum2 = vmlaq_f32(_sum2, _k22, _r26);
1376                 _sum2 = vmlaq_f32(_sum2, _k23, _r27);
1377                 _sum2 = vmlaq_f32(_sum2, _k24, _r28);
1378                 _sum3 = vmlaq_f32(_sum3, _k20, _r26);
1379                 _sum3 = vmlaq_f32(_sum3, _k21, _r27);
1380                 _sum3 = vmlaq_f32(_sum3, _k22, _r28);
1381                 _sum3 = vmlaq_f32(_sum3, _k23, _r29);
1382                 _sum3 = vmlaq_f32(_sum3, _k24, _r210);
1383 
1384                 float32x4_t _r30 = vld1q_f32(r3);
1385                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1386                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1387                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1388                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1389                 float32x4_t _r35 = vld1q_f32(r3 + 20);
1390                 float32x4_t _r36 = vld1q_f32(r3 + 24);
1391                 float32x4_t _r37 = vld1q_f32(r3 + 28);
1392                 float32x4_t _r38 = vld1q_f32(r3 + 32);
1393                 float32x4_t _r39 = vld1q_f32(r3 + 36);
1394                 float32x4_t _r310 = vld1q_f32(r3 + 40);
1395 
1396                 float32x4_t _k30 = vld1q_f32(k0);
1397                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1398                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1399                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1400                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1401                 k0 += 20;
1402 
1403                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1404                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1405                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1406                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1407                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1408                 _sum1 = vmlaq_f32(_sum1, _k30, _r32);
1409                 _sum1 = vmlaq_f32(_sum1, _k31, _r33);
1410                 _sum1 = vmlaq_f32(_sum1, _k32, _r34);
1411                 _sum1 = vmlaq_f32(_sum1, _k33, _r35);
1412                 _sum1 = vmlaq_f32(_sum1, _k34, _r36);
1413                 _sum2 = vmlaq_f32(_sum2, _k30, _r34);
1414                 _sum2 = vmlaq_f32(_sum2, _k31, _r35);
1415                 _sum2 = vmlaq_f32(_sum2, _k32, _r36);
1416                 _sum2 = vmlaq_f32(_sum2, _k33, _r37);
1417                 _sum2 = vmlaq_f32(_sum2, _k34, _r38);
1418                 _sum3 = vmlaq_f32(_sum3, _k30, _r36);
1419                 _sum3 = vmlaq_f32(_sum3, _k31, _r37);
1420                 _sum3 = vmlaq_f32(_sum3, _k32, _r38);
1421                 _sum3 = vmlaq_f32(_sum3, _k33, _r39);
1422                 _sum3 = vmlaq_f32(_sum3, _k34, _r310);
1423 
1424                 float32x4_t _r40 = vld1q_f32(r4);
1425                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1426                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1427                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1428                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1429                 float32x4_t _r45 = vld1q_f32(r4 + 20);
1430                 float32x4_t _r46 = vld1q_f32(r4 + 24);
1431                 float32x4_t _r47 = vld1q_f32(r4 + 28);
1432                 float32x4_t _r48 = vld1q_f32(r4 + 32);
1433                 float32x4_t _r49 = vld1q_f32(r4 + 36);
1434                 float32x4_t _r410 = vld1q_f32(r4 + 40);
1435 
1436                 float32x4_t _k40 = vld1q_f32(k0);
1437                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1438                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1439                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1440                 float32x4_t _k44 = vld1q_f32(k0 + 16);
1441                 k0 -= 80;
1442 
1443                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1444                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1445                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1446                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1447                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1448                 _sum1 = vmlaq_f32(_sum1, _k40, _r42);
1449                 _sum1 = vmlaq_f32(_sum1, _k41, _r43);
1450                 _sum1 = vmlaq_f32(_sum1, _k42, _r44);
1451                 _sum1 = vmlaq_f32(_sum1, _k43, _r45);
1452                 _sum1 = vmlaq_f32(_sum1, _k44, _r46);
1453                 _sum2 = vmlaq_f32(_sum2, _k40, _r44);
1454                 _sum2 = vmlaq_f32(_sum2, _k41, _r45);
1455                 _sum2 = vmlaq_f32(_sum2, _k42, _r46);
1456                 _sum2 = vmlaq_f32(_sum2, _k43, _r47);
1457                 _sum2 = vmlaq_f32(_sum2, _k44, _r48);
1458                 _sum3 = vmlaq_f32(_sum3, _k40, _r46);
1459                 _sum3 = vmlaq_f32(_sum3, _k41, _r47);
1460                 _sum3 = vmlaq_f32(_sum3, _k42, _r48);
1461                 _sum3 = vmlaq_f32(_sum3, _k43, _r49);
1462                 _sum3 = vmlaq_f32(_sum3, _k44, _r410);
1463 
1464                 vst1q_f32(outptr0, _sum0);
1465                 vst1q_f32(outptr0 + 4, _sum1);
1466                 vst1q_f32(outptr0 + 8, _sum2);
1467                 vst1q_f32(outptr0 + 12, _sum3);
1468 
1469                 r0 += 8 * 4;
1470                 r1 += 8 * 4;
1471                 r2 += 8 * 4;
1472                 r3 += 8 * 4;
1473                 r4 += 8 * 4;
1474                 outptr0 += 16;
1475             }
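            // tail: 2 output pixels per iteration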
1476             for (; j + 1 < outw; j += 2)
1477             {
1478                 float32x4_t _sum0 = _bias0;
1479                 float32x4_t _sum1 = _bias0;
1480 
1481                 float32x4_t _r00 = vld1q_f32(r0);
1482                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1483                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1484                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1485                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1486                 float32x4_t _r05 = vld1q_f32(r0 + 20);
1487                 float32x4_t _r06 = vld1q_f32(r0 + 24);
1488 
1489                 float32x4_t _k00 = vld1q_f32(k0);
1490                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1491                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1492                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1493                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1494                 k0 += 20;
1495 
1496                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1497                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1498                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1499                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1500                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1501                 _sum1 = vmlaq_f32(_sum1, _k00, _r02);
1502                 _sum1 = vmlaq_f32(_sum1, _k01, _r03);
1503                 _sum1 = vmlaq_f32(_sum1, _k02, _r04);
1504                 _sum1 = vmlaq_f32(_sum1, _k03, _r05);
1505                 _sum1 = vmlaq_f32(_sum1, _k04, _r06);
1506 
1507                 float32x4_t _r10 = vld1q_f32(r1);
1508                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1509                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1510                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1511                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1512                 float32x4_t _r15 = vld1q_f32(r1 + 20);
1513                 float32x4_t _r16 = vld1q_f32(r1 + 24);
1514 
1515                 float32x4_t _k10 = vld1q_f32(k0);
1516                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1517                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1518                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1519                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1520                 k0 += 20;
1521 
1522                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1523                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1524                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1525                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1526                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1527                 _sum1 = vmlaq_f32(_sum1, _k10, _r12);
1528                 _sum1 = vmlaq_f32(_sum1, _k11, _r13);
1529                 _sum1 = vmlaq_f32(_sum1, _k12, _r14);
1530                 _sum1 = vmlaq_f32(_sum1, _k13, _r15);
1531                 _sum1 = vmlaq_f32(_sum1, _k14, _r16);
1532 
1533                 float32x4_t _r20 = vld1q_f32(r2);
1534                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1535                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1536                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1537                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1538                 float32x4_t _r25 = vld1q_f32(r2 + 20);
1539                 float32x4_t _r26 = vld1q_f32(r2 + 24);
1540 
1541                 float32x4_t _k20 = vld1q_f32(k0);
1542                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1543                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1544                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1545                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1546                 k0 += 20;
1547 
1548                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1549                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1550                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1551                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1552                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1553                 _sum1 = vmlaq_f32(_sum1, _k20, _r22);
1554                 _sum1 = vmlaq_f32(_sum1, _k21, _r23);
1555                 _sum1 = vmlaq_f32(_sum1, _k22, _r24);
1556                 _sum1 = vmlaq_f32(_sum1, _k23, _r25);
1557                 _sum1 = vmlaq_f32(_sum1, _k24, _r26);
1558 
1559                 float32x4_t _r30 = vld1q_f32(r3);
1560                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1561                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1562                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1563                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1564                 float32x4_t _r35 = vld1q_f32(r3 + 20);
1565                 float32x4_t _r36 = vld1q_f32(r3 + 24);
1566 
1567                 float32x4_t _k30 = vld1q_f32(k0);
1568                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1569                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1570                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1571                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1572                 k0 += 20;
1573 
1574                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1575                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1576                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1577                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1578                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1579                 _sum1 = vmlaq_f32(_sum1, _k30, _r32);
1580                 _sum1 = vmlaq_f32(_sum1, _k31, _r33);
1581                 _sum1 = vmlaq_f32(_sum1, _k32, _r34);
1582                 _sum1 = vmlaq_f32(_sum1, _k33, _r35);
1583                 _sum1 = vmlaq_f32(_sum1, _k34, _r36);
1584 
1585                 float32x4_t _r40 = vld1q_f32(r4);
1586                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1587                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1588                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1589                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1590                 float32x4_t _r45 = vld1q_f32(r4 + 20);
1591                 float32x4_t _r46 = vld1q_f32(r4 + 24);
1592 
1593                 float32x4_t _k40 = vld1q_f32(k0);
1594                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1595                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1596                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1597                 float32x4_t _k44 = vld1q_f32(k0 + 16);
1598                 k0 -= 80;
1599 
1600                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1601                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1602                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1603                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1604                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1605                 _sum1 = vmlaq_f32(_sum1, _k40, _r42);
1606                 _sum1 = vmlaq_f32(_sum1, _k41, _r43);
1607                 _sum1 = vmlaq_f32(_sum1, _k42, _r44);
1608                 _sum1 = vmlaq_f32(_sum1, _k43, _r45);
1609                 _sum1 = vmlaq_f32(_sum1, _k44, _r46);
1610 
1611                 vst1q_f32(outptr0, _sum0);
1612                 vst1q_f32(outptr0 + 4, _sum1);
1613 
1614                 r0 += 4 * 4;
1615                 r1 += 4 * 4;
1616                 r2 += 4 * 4;
1617                 r3 += 4 * 4;
1618                 r4 += 4 * 4;
1619                 outptr0 += 8;
1620             }
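            // tail: single output pixel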
1621             for (; j < outw; j++)
1622             {
1623                 float32x4_t _sum0 = _bias0;
1624 
1625                 float32x4_t _r00 = vld1q_f32(r0);
1626                 float32x4_t _r01 = vld1q_f32(r0 + 4);
1627                 float32x4_t _r02 = vld1q_f32(r0 + 8);
1628                 float32x4_t _r03 = vld1q_f32(r0 + 12);
1629                 float32x4_t _r04 = vld1q_f32(r0 + 16);
1630 
1631                 float32x4_t _k00 = vld1q_f32(k0);
1632                 float32x4_t _k01 = vld1q_f32(k0 + 4);
1633                 float32x4_t _k02 = vld1q_f32(k0 + 8);
1634                 float32x4_t _k03 = vld1q_f32(k0 + 12);
1635                 float32x4_t _k04 = vld1q_f32(k0 + 16);
1636                 k0 += 20;
1637 
1638                 _sum0 = vmlaq_f32(_sum0, _k00, _r00);
1639                 _sum0 = vmlaq_f32(_sum0, _k01, _r01);
1640                 _sum0 = vmlaq_f32(_sum0, _k02, _r02);
1641                 _sum0 = vmlaq_f32(_sum0, _k03, _r03);
1642                 _sum0 = vmlaq_f32(_sum0, _k04, _r04);
1643 
1644                 float32x4_t _r10 = vld1q_f32(r1);
1645                 float32x4_t _r11 = vld1q_f32(r1 + 4);
1646                 float32x4_t _r12 = vld1q_f32(r1 + 8);
1647                 float32x4_t _r13 = vld1q_f32(r1 + 12);
1648                 float32x4_t _r14 = vld1q_f32(r1 + 16);
1649 
1650                 float32x4_t _k10 = vld1q_f32(k0);
1651                 float32x4_t _k11 = vld1q_f32(k0 + 4);
1652                 float32x4_t _k12 = vld1q_f32(k0 + 8);
1653                 float32x4_t _k13 = vld1q_f32(k0 + 12);
1654                 float32x4_t _k14 = vld1q_f32(k0 + 16);
1655                 k0 += 20;
1656 
1657                 _sum0 = vmlaq_f32(_sum0, _k10, _r10);
1658                 _sum0 = vmlaq_f32(_sum0, _k11, _r11);
1659                 _sum0 = vmlaq_f32(_sum0, _k12, _r12);
1660                 _sum0 = vmlaq_f32(_sum0, _k13, _r13);
1661                 _sum0 = vmlaq_f32(_sum0, _k14, _r14);
1662 
1663                 float32x4_t _r20 = vld1q_f32(r2);
1664                 float32x4_t _r21 = vld1q_f32(r2 + 4);
1665                 float32x4_t _r22 = vld1q_f32(r2 + 8);
1666                 float32x4_t _r23 = vld1q_f32(r2 + 12);
1667                 float32x4_t _r24 = vld1q_f32(r2 + 16);
1668 
1669                 float32x4_t _k20 = vld1q_f32(k0);
1670                 float32x4_t _k21 = vld1q_f32(k0 + 4);
1671                 float32x4_t _k22 = vld1q_f32(k0 + 8);
1672                 float32x4_t _k23 = vld1q_f32(k0 + 12);
1673                 float32x4_t _k24 = vld1q_f32(k0 + 16);
1674                 k0 += 20;
1675 
1676                 _sum0 = vmlaq_f32(_sum0, _k20, _r20);
1677                 _sum0 = vmlaq_f32(_sum0, _k21, _r21);
1678                 _sum0 = vmlaq_f32(_sum0, _k22, _r22);
1679                 _sum0 = vmlaq_f32(_sum0, _k23, _r23);
1680                 _sum0 = vmlaq_f32(_sum0, _k24, _r24);
1681 
1682                 float32x4_t _r30 = vld1q_f32(r3);
1683                 float32x4_t _r31 = vld1q_f32(r3 + 4);
1684                 float32x4_t _r32 = vld1q_f32(r3 + 8);
1685                 float32x4_t _r33 = vld1q_f32(r3 + 12);
1686                 float32x4_t _r34 = vld1q_f32(r3 + 16);
1687 
1688                 float32x4_t _k30 = vld1q_f32(k0);
1689                 float32x4_t _k31 = vld1q_f32(k0 + 4);
1690                 float32x4_t _k32 = vld1q_f32(k0 + 8);
1691                 float32x4_t _k33 = vld1q_f32(k0 + 12);
1692                 float32x4_t _k34 = vld1q_f32(k0 + 16);
1693                 k0 += 20;
1694 
1695                 _sum0 = vmlaq_f32(_sum0, _k30, _r30);
1696                 _sum0 = vmlaq_f32(_sum0, _k31, _r31);
1697                 _sum0 = vmlaq_f32(_sum0, _k32, _r32);
1698                 _sum0 = vmlaq_f32(_sum0, _k33, _r33);
1699                 _sum0 = vmlaq_f32(_sum0, _k34, _r34);
1700 
1701                 float32x4_t _r40 = vld1q_f32(r4);
1702                 float32x4_t _r41 = vld1q_f32(r4 + 4);
1703                 float32x4_t _r42 = vld1q_f32(r4 + 8);
1704                 float32x4_t _r43 = vld1q_f32(r4 + 12);
1705                 float32x4_t _r44 = vld1q_f32(r4 + 16);
1706 
1707                 float32x4_t _k40 = vld1q_f32(k0);
1708                 float32x4_t _k41 = vld1q_f32(k0 + 4);
1709                 float32x4_t _k42 = vld1q_f32(k0 + 8);
1710                 float32x4_t _k43 = vld1q_f32(k0 + 12);
1711                 float32x4_t _k44 = vld1q_f32(k0 + 16);
1712                 k0 -= 80;
1713 
1714                 _sum0 = vmlaq_f32(_sum0, _k40, _r40);
1715                 _sum0 = vmlaq_f32(_sum0, _k41, _r41);
1716                 _sum0 = vmlaq_f32(_sum0, _k42, _r42);
1717                 _sum0 = vmlaq_f32(_sum0, _k43, _r43);
1718                 _sum0 = vmlaq_f32(_sum0, _k44, _r44);
1719 
1720                 vst1q_f32(outptr0, _sum0);
1721 
1722                 r0 += 2 * 4;
1723                 r1 += 2 * 4;
1724                 r2 += 2 * 4;
1725                 r3 += 2 * 4;
1726                 r4 += 2 * 4;
1727                 outptr0 += 4;
1728             }
1729 
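            // advance all five row pointers to the next pair of input rows (see tailstep above)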
1730             r0 += tailstep;
1731             r1 += tailstep;
1732             r2 += tailstep;
1733             r3 += tailstep;
1734             r4 += tailstep;
1735         }
1736     }
1737 }
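
// For readability, each output element produced by convdw5x5s2_pack4_neon is equivalent to the
// scalar form sketched below (illustrative only; g, i, j stand for the group, output row and
// output column, and this snippet is not compiled or called anywhere):
//
//     for (int c = 0; c < 4; c++)
//     {
//         float sum = bias ? bias[g * 4 + c] : 0.f;
//         for (int ky = 0; ky < 5; ky++)
//         {
//             const float* rrow = bottom_blob.channel(g).row(i * 2 + ky);
//             const float* krow = (const float*)kernel.row(g) + ky * 5 * 4;
//             for (int kx = 0; kx < 5; kx++)
//             {
//                 sum += krow[kx * 4 + c] * rrow[(j * 2 + kx) * 4 + c];
//             }
//         }
//         top_blob.channel(g).row(i)[j * 4 + c] = sum;
//     }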
1738