1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv3x3s1_winograd64_transform_kernel_pack4_neon(const Mat & kernel,Mat & kernel_tm_pack4,int inch,int outch)15 static void conv3x3s1_winograd64_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
16 {
17     // winograd63 transform kernel
18     Mat kernel_tm;
19     kernel_tm.create(8 * 8, inch, outch);
20 
21     const float ktm[8][3] = {
22         {1.0f, 0.0f, 0.0f},
23         {-2.0f / 9, -2.0f / 9, -2.0f / 9},
24         {-2.0f / 9, 2.0f / 9, -2.0f / 9},
25         {1.0f / 90, 1.0f / 45, 2.0f / 45},
26         {1.0f / 90, -1.0f / 45, 2.0f / 45},
27         {1.0f / 45, 1.0f / 90, 1.0f / 180},
28         {1.0f / 45, -1.0f / 90, 1.0f / 180},
29         {0.0f, 0.0f, 1.0f}
30     };
31 
32     #pragma omp parallel for
33     for (int p = 0; p < outch; p++)
34     {
35         for (int q = 0; q < inch; q++)
36         {
37             const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
38             float* kernel_tm0 = kernel_tm.channel(p).row(q);
39 
40             // transform kernel, transposed
41             const float* k0 = kernel0;
42             const float* k1 = kernel0 + 3;
43             const float* k2 = kernel0 + 6;
44 
45             // h
46             float tmp[8][3];
47             for (int i = 0; i < 8; i++)
48             {
49                 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
50                 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
51                 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
52             }
53 
54             // v
55             for (int j = 0; j < 8; j++)
56             {
57                 float* tmpp = &tmp[j][0];
58 
59                 for (int i = 0; i < 8; i++)
60                 {
61                     kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
62                 }
63             }
64         }
65     }
66 
67     // interleave
68     // src = 64-inch-outch
69     // dst = 4b-4a-inch/4a-64-outch/4b;
70 #if __aarch64__
71     kernel_tm_pack4.create(2 * inch / 4, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
72 #else
73     kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 16, 16);
74 #endif
75 
76     int q = 0;
77 #if __aarch64__
78     for (; q + 7 < outch; q += 8)
79     {
80         const Mat k0 = kernel_tm.channel(q);
81         const Mat k1 = kernel_tm.channel(q + 1);
82         const Mat k2 = kernel_tm.channel(q + 2);
83         const Mat k3 = kernel_tm.channel(q + 3);
84         const Mat k4 = kernel_tm.channel(q + 4);
85         const Mat k5 = kernel_tm.channel(q + 5);
86         const Mat k6 = kernel_tm.channel(q + 6);
87         const Mat k7 = kernel_tm.channel(q + 7);
88 
89         Mat g0 = kernel_tm_pack4.channel(q / 8);
90 
91         for (int k = 0; k < 64; k++)
92         {
93             float* g00 = g0.row(k);
94 
95             for (int p = 0; p + 3 < inch; p += 4)
96             {
97                 const float* k00 = k0.row(p);
98                 const float* k01 = k0.row(p + 1);
99                 const float* k02 = k0.row(p + 2);
100                 const float* k03 = k0.row(p + 3);
101 
102                 const float* k10 = k1.row(p);
103                 const float* k11 = k1.row(p + 1);
104                 const float* k12 = k1.row(p + 2);
105                 const float* k13 = k1.row(p + 3);
106 
107                 const float* k20 = k2.row(p);
108                 const float* k21 = k2.row(p + 1);
109                 const float* k22 = k2.row(p + 2);
110                 const float* k23 = k2.row(p + 3);
111 
112                 const float* k30 = k3.row(p);
113                 const float* k31 = k3.row(p + 1);
114                 const float* k32 = k3.row(p + 2);
115                 const float* k33 = k3.row(p + 3);
116 
117                 const float* k40 = k4.row(p);
118                 const float* k41 = k4.row(p + 1);
119                 const float* k42 = k4.row(p + 2);
120                 const float* k43 = k4.row(p + 3);
121 
122                 const float* k50 = k5.row(p);
123                 const float* k51 = k5.row(p + 1);
124                 const float* k52 = k5.row(p + 2);
125                 const float* k53 = k5.row(p + 3);
126 
127                 const float* k60 = k6.row(p);
128                 const float* k61 = k6.row(p + 1);
129                 const float* k62 = k6.row(p + 2);
130                 const float* k63 = k6.row(p + 3);
131 
132                 const float* k70 = k7.row(p);
133                 const float* k71 = k7.row(p + 1);
134                 const float* k72 = k7.row(p + 2);
135                 const float* k73 = k7.row(p + 3);
136 
137                 g00[0] = k00[k];
138                 g00[1] = k10[k];
139                 g00[2] = k20[k];
140                 g00[3] = k30[k];
141 
142                 g00[4] = k40[k];
143                 g00[5] = k50[k];
144                 g00[6] = k60[k];
145                 g00[7] = k70[k];
146 
147                 g00[8] = k01[k];
148                 g00[9] = k11[k];
149                 g00[10] = k21[k];
150                 g00[11] = k31[k];
151 
152                 g00[12] = k41[k];
153                 g00[13] = k51[k];
154                 g00[14] = k61[k];
155                 g00[15] = k71[k];
156 
157                 g00[16] = k02[k];
158                 g00[17] = k12[k];
159                 g00[18] = k22[k];
160                 g00[19] = k32[k];
161 
162                 g00[20] = k42[k];
163                 g00[21] = k52[k];
164                 g00[22] = k62[k];
165                 g00[23] = k72[k];
166 
167                 g00[24] = k03[k];
168                 g00[25] = k13[k];
169                 g00[26] = k23[k];
170                 g00[27] = k33[k];
171 
172                 g00[28] = k43[k];
173                 g00[29] = k53[k];
174                 g00[30] = k63[k];
175                 g00[31] = k73[k];
176 
177                 g00 += 32;
178             }
179         }
180     }
181 #endif // __aarch64__
182     for (; q + 3 < outch; q += 4)
183     {
184         const Mat k0 = kernel_tm.channel(q);
185         const Mat k1 = kernel_tm.channel(q + 1);
186         const Mat k2 = kernel_tm.channel(q + 2);
187         const Mat k3 = kernel_tm.channel(q + 3);
188 
189 #if __aarch64__
190         Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
191 #else
192         Mat g0 = kernel_tm_pack4.channel(q / 4);
193 #endif
194 
195         for (int k = 0; k < 64; k++)
196         {
197             float* g00 = g0.row(k);
198 
199             for (int p = 0; p + 3 < inch; p += 4)
200             {
201                 const float* k00 = k0.row(p);
202                 const float* k01 = k0.row(p + 1);
203                 const float* k02 = k0.row(p + 2);
204                 const float* k03 = k0.row(p + 3);
205 
206                 const float* k10 = k1.row(p);
207                 const float* k11 = k1.row(p + 1);
208                 const float* k12 = k1.row(p + 2);
209                 const float* k13 = k1.row(p + 3);
210 
211                 const float* k20 = k2.row(p);
212                 const float* k21 = k2.row(p + 1);
213                 const float* k22 = k2.row(p + 2);
214                 const float* k23 = k2.row(p + 3);
215 
216                 const float* k30 = k3.row(p);
217                 const float* k31 = k3.row(p + 1);
218                 const float* k32 = k3.row(p + 2);
219                 const float* k33 = k3.row(p + 3);
220 
221                 g00[0] = k00[k];
222                 g00[1] = k10[k];
223                 g00[2] = k20[k];
224                 g00[3] = k30[k];
225 
226                 g00[4] = k01[k];
227                 g00[5] = k11[k];
228                 g00[6] = k21[k];
229                 g00[7] = k31[k];
230 
231                 g00[8] = k02[k];
232                 g00[9] = k12[k];
233                 g00[10] = k22[k];
234                 g00[11] = k32[k];
235 
236                 g00[12] = k03[k];
237                 g00[13] = k13[k];
238                 g00[14] = k23[k];
239                 g00[15] = k33[k];
240 
241                 g00 += 16;
242             }
243         }
244     }
245 }
246 
conv3x3s1_winograd64_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Mat & _bias,const Option & opt)247 static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
248 {
249     int w = bottom_blob.w;
250     int h = bottom_blob.h;
251     int inch = bottom_blob.c;
252     size_t elemsize = bottom_blob.elemsize;
253     int elempack = bottom_blob.elempack;
254 
255     int outw = top_blob.w;
256     int outh = top_blob.h;
257     int outch = top_blob.c;
258 
259     // pad to 6n+2
260     Mat bottom_blob_bordered = bottom_blob;
261 
262     outw = (outw + 5) / 6 * 6;
263     outh = (outh + 5) / 6 * 6;
264 
265     w = outw + 2;
266     h = outh + 2;
267     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
268 
269     const float* bias = _bias;
270 
271     // BEGIN transform input
272     Mat bottom_blob_tm;
273     {
274         int w_tm = outw / 6 * 8;
275         int h_tm = outh / 6 * 8;
276 
277         const int tiles = w_tm / 8 * h_tm / 8;
278 
279         bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
280 
281         //         const float itm[8][8] = {
282         //             {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
283         //
284         //             {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
285         //             {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
286         //
287         //             {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
288         //             {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
289         //
290         //             {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
291         //             {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
292         //
293         //             {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
294         //         };
295 
296         // 0 = r00 - r06 + (r04 - r02) * 5.25
297         // 7 = r07 - r01 + (r03 - r05) * 5.25
298 
299         // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
300         // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
301 
302         // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
303         // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
304 
305         // reuse r04 * 1.25
306         // reuse r03 * 2.5
307         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
308         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
309 
310         #pragma omp parallel for num_threads(opt.num_threads)
311         for (int q = 0; q < inch; q++)
312         {
313             const Mat img0 = bottom_blob_bordered.channel(q);
314             Mat img0_tm = bottom_blob_tm.channel(q);
315 
316             float tmp[8][8][4];
317 
318             // tile
319             for (int i = 0; i < h_tm / 8; i++)
320             {
321                 for (int j = 0; j < w_tm / 8; j++)
322                 {
323                     const float* r0 = img0.row(i * 6) + (j * 6) * 4;
324 
325                     for (int m = 0; m < 8; m++)
326                     {
327                         float32x4_t _r00 = vld1q_f32(r0);
328                         float32x4_t _r01 = vld1q_f32(r0 + 4);
329                         float32x4_t _r02 = vld1q_f32(r0 + 8);
330                         float32x4_t _r03 = vld1q_f32(r0 + 12);
331                         float32x4_t _r04 = vld1q_f32(r0 + 16);
332                         float32x4_t _r05 = vld1q_f32(r0 + 20);
333                         float32x4_t _r06 = vld1q_f32(r0 + 24);
334                         float32x4_t _r07 = vld1q_f32(r0 + 28);
335 
336                         float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
337                         float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
338                         vst1q_f32(tmp[0][m], _tmp0m);
339                         vst1q_f32(tmp[7][m], _tmp7m);
340 
341                         //                         tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25;
342                         //                         tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25;
343 
344                         float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
345                         float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
346 
347                         //                         float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25);
348                         //                         float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25);
349 
350                         float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
351                         float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
352                         vst1q_f32(tmp[1][m], _tmp1m);
353                         vst1q_f32(tmp[2][m], _tmp2m);
354 
355                         //                         tmp[1][m] = tmp12a + tmp12b;
356                         //                         tmp[2][m] = tmp12a - tmp12b;
357 
358                         float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
359                         float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
360 
361                         //                         float tmp34a = (r0[6] + r0[2] * 0.25 - r0[4] * 1.25);
362                         //                         float tmp34b = (r0[1] * 0.5 - r0[3] * 2.5 + r0[5] * 2);
363 
364                         float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
365                         float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
366                         vst1q_f32(tmp[3][m], _tmp3m);
367                         vst1q_f32(tmp[4][m], _tmp4m);
368 
369                         //                         tmp[3][m] = tmp34a + tmp34b;
370                         //                         tmp[4][m] = tmp34a - tmp34b;
371 
372                         float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
373                         float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
374 
375                         //                         float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25) * 4);
376                         //                         float tmp56b = (r0[1] * 2 - r0[3] * 2.5 + r0[5] * 0.5);
377 
378                         float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
379                         float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
380                         vst1q_f32(tmp[5][m], _tmp5m);
381                         vst1q_f32(tmp[6][m], _tmp6m);
382 
383                         //                         tmp[5][m] = tmp56a + tmp56b;
384                         //                         tmp[6][m] = tmp56a - tmp56b;
385 
386                         r0 += w * 4;
387                     }
388 
389                     float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 8 + j) * 4;
390                     float* r0_tm_1 = r0_tm_0 + tiles * 4;
391                     float* r0_tm_2 = r0_tm_0 + tiles * 8;
392                     float* r0_tm_3 = r0_tm_0 + tiles * 12;
393                     float* r0_tm_4 = r0_tm_0 + tiles * 16;
394                     float* r0_tm_5 = r0_tm_0 + tiles * 20;
395                     float* r0_tm_6 = r0_tm_0 + tiles * 24;
396                     float* r0_tm_7 = r0_tm_0 + tiles * 28;
397 
398                     for (int m = 0; m < 8; m++)
399                     {
400                         float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
401                         float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
402                         float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
403                         float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
404                         float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
405                         float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
406                         float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
407                         float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
408 
409                         float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
410                         float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
411 
412                         //                         r0_tm[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25;
413                         //                         r0_tm[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25;
414 
415                         float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
416                         float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
417 
418                         //                         float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25);
419                         //                         float tmp12b = (tmp0[1] + tmp0[5] - tmp0[3] * 4.25);
420 
421                         float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
422                         float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
423 
424                         //                         r0_tm[1] = tmp12a + tmp12b;
425                         //                         r0_tm[2] = tmp12a - tmp12b;
426 
427                         float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
428                         float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
429 
430                         //                         float tmp34a = (tmp0[6] + tmp0[2] * 0.25 - tmp0[4] * 1.25);
431                         //                         float tmp34b = (tmp0[1] * 0.5 - tmp0[3] * 2.5 + tmp0[5] * 2);
432 
433                         float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
434                         float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
435 
436                         //                         r0_tm[3] = tmp34a + tmp34b;
437                         //                         r0_tm[4] = tmp34a - tmp34b;
438 
439                         float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
440                         float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
441 
442                         //                         float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25) * 4);
443                         //                         float tmp56b = (tmp0[1] * 2 - tmp0[3] * 2.5 + tmp0[5] * 0.5);
444 
445                         float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
446                         float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
447 
448                         //                         r0_tm[5] = tmp56a + tmp56b;
449                         //                         r0_tm[6] = tmp56a - tmp56b;
450 
451                         vst1q_f32(r0_tm_0, _r0tm0);
452                         vst1q_f32(r0_tm_1, _r0tm1);
453                         vst1q_f32(r0_tm_2, _r0tm2);
454                         vst1q_f32(r0_tm_3, _r0tm3);
455                         vst1q_f32(r0_tm_4, _r0tm4);
456                         vst1q_f32(r0_tm_5, _r0tm5);
457                         vst1q_f32(r0_tm_6, _r0tm6);
458                         vst1q_f32(r0_tm_7, _r0tm7);
459 
460                         r0_tm_0 += tiles * 32;
461                         r0_tm_1 += tiles * 32;
462                         r0_tm_2 += tiles * 32;
463                         r0_tm_3 += tiles * 32;
464                         r0_tm_4 += tiles * 32;
465                         r0_tm_5 += tiles * 32;
466                         r0_tm_6 += tiles * 32;
467                         r0_tm_7 += tiles * 32;
468                     }
469                 }
470             }
471         }
472     }
473     bottom_blob_bordered = Mat();
474     // END transform input
475 
476     // BEGIN dot
477     Mat top_blob_tm;
478     {
479         int w_tm = outw / 6 * 8;
480         int h_tm = outh / 6 * 8;
481 
482         const int tiles = h_tm / 8 * w_tm / 8;
483 
484         // permute
485         //         bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
486         Mat bottom_blob_tm2;
487 #if __aarch64__
488         if (tiles >= 12)
489             bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, 64, elemsize, elempack, opt.workspace_allocator);
490         else if (tiles >= 8)
491             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
492         else if (tiles >= 4)
493             bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
494         else if (tiles >= 2)
495             bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
496         else // if (tiles >= 1)
497             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
498 #else
499         if (tiles >= 8)
500             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
501         else if (tiles >= 4)
502             bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
503         else if (tiles >= 2)
504             bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
505         else // if (tiles >= 1)
506             bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
507 #endif
508 
509         #pragma omp parallel for num_threads(opt.num_threads)
510         for (int r = 0; r < 64; r++)
511         {
512             Mat tm2 = bottom_blob_tm2.channel(r);
513 
514             // tile
515             int i = 0;
516 #if __aarch64__
517             for (; i + 11 < tiles; i += 12)
518             {
519                 float* tm2p = tm2.row(i / 12);
520 
521                 const float* r0 = bottom_blob_tm;
522 
523                 r0 += (r * tiles + i) * 4;
524 
525                 for (int q = 0; q < inch; q++)
526                 {
527                     asm volatile(
528                         "prfm   pldl1keep, [%0, #512]       \n"
529                         "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
530                         "prfm   pldl1keep, [%0, #512]       \n"
531                         "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
532                         "prfm   pldl1keep, [%0, #512]       \n"
533                         "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
534                         "st1    {v0.4s}, [%1], #16          \n"
535                         "st1    {v4.4s}, [%1], #16          \n"
536                         "st1    {v8.4s}, [%1], #16          \n"
537                         "sub    %0, %0, #128                \n"
538                         "st1    {v1.4s}, [%1], #16          \n"
539                         "st1    {v5.4s}, [%1], #16          \n"
540                         "st1    {v9.4s}, [%1], #16          \n"
541                         "st1    {v2.4s}, [%1], #16          \n"
542                         "st1    {v6.4s}, [%1], #16          \n"
543                         "st1    {v10.4s}, [%1], #16         \n"
544                         "st1    {v3.4s}, [%1], #16          \n"
545                         "st1    {v7.4s}, [%1], #16          \n"
546                         "st1    {v11.4s}, [%1], #16         \n"
547                         : "=r"(r0),  // %0
548                         "=r"(tm2p) // %1
549                         : "0"(r0),
550                         "1"(tm2p)
551                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
552                     r0 += bottom_blob_tm.cstep * 4;
553                 }
554             }
555 #endif
556             for (; i + 7 < tiles; i += 8)
557             {
558 #if __aarch64__
559                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
560 #else
561                 float* tm2p = tm2.row(i / 8);
562 #endif
563 
564                 const float* r0 = bottom_blob_tm;
565 
566                 r0 += (r * tiles + i) * 4;
567 
568                 for (int q = 0; q < inch; q++)
569                 {
570 #if __aarch64__
571                     asm volatile(
572                         "prfm   pldl1keep, [%0, #512]       \n"
573                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
574                         "prfm   pldl1keep, [%0, #512]       \n"
575                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
576                         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
577                         "sub    %0, %0, #64                 \n"
578                         "st1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
579                         : "=r"(r0),  // %0
580                         "=r"(tm2p) // %1
581                         : "0"(r0),
582                         "1"(tm2p)
583                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
584 #else
585                     asm volatile(
586                         "pld        [%0, #512]          \n"
587                         "vldm       %0!, {d0-d7}        \n"
588                         "pld        [%0, #512]          \n"
589                         "vldm       %0, {d16-d23}       \n"
590 
591                         // transpose 8x4
592                         "vtrn.32    q0, q1              \n"
593                         "vtrn.32    q2, q3              \n"
594                         "vtrn.32    q8, q9              \n"
595                         "vtrn.32    q10, q11            \n"
596                         "vswp       d1, d4              \n"
597                         "vswp       d3, d6              \n"
598                         "vswp       d17, d20            \n"
599                         "vswp       d19, d22            \n"
600                         "vswp       q1, q8              \n"
601                         "vswp       q3, q10             \n"
602 
603                         "vst1.f32   {d0-d3}, [%1 :128]! \n"
604                         "vst1.f32   {d16-d19}, [%1 :128]! \n"
605                         "sub        %0, %0, #64         \n"
606                         "vst1.f32   {d4-d7}, [%1 :128]! \n"
607                         "vst1.f32   {d20-d23}, [%1 :128]! \n"
608                         : "=r"(r0),  // %0
609                         "=r"(tm2p) // %1
610                         : "0"(r0),
611                         "1"(tm2p)
612                         : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
613 #endif
614                     r0 += bottom_blob_tm.cstep * 4;
615                 }
616             }
617             for (; i + 3 < tiles; i += 4)
618             {
619 #if __aarch64__
620                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
621 #else
622                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
623 #endif
624 
625                 const float* r0 = bottom_blob_tm;
626 
627                 r0 += (r * tiles + i) * 4;
628 
629                 for (int q = 0; q < inch; q++)
630                 {
631 #if __aarch64__
632                     asm volatile(
633                         "prfm   pldl1keep, [%0, #512]       \n"
634                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
635                         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
636                         : "=r"(r0),  // %0
637                         "=r"(tm2p) // %1
638                         : "0"(r0),
639                         "1"(tm2p)
640                         : "memory", "v0", "v1", "v2", "v3");
641 #else
642                     asm volatile(
643                         "pld        [%0, #512]          \n"
644                         "vldm       %0, {d0-d7}         \n"
645                         "vstm       %1!, {d0-d7}        \n"
646                         : "=r"(r0),  // %0
647                         "=r"(tm2p) // %1
648                         : "0"(r0),
649                         "1"(tm2p)
650                         : "memory", "q0", "q1", "q2", "q3");
651 #endif // __aarch64__
652                     r0 += bottom_blob_tm.cstep * 4;
653                 }
654             }
655             for (; i + 1 < tiles; i += 2)
656             {
657 #if __aarch64__
658                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
659 #else
660                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
661 #endif
662 
663                 const float* r0 = bottom_blob_tm;
664 
665                 r0 += (r * tiles + i) * 4;
666 
667                 for (int q = 0; q < inch; q++)
668                 {
669 #if __aarch64__
670                     asm volatile(
671                         "prfm   pldl1keep, [%0, #256]       \n"
672                         "ld1    {v0.4s, v1.4s}, [%0]        \n"
673                         "st1    {v0.4s, v1.4s}, [%1], #32   \n"
674                         : "=r"(r0),  // %0
675                         "=r"(tm2p) // %1
676                         : "0"(r0),
677                         "1"(tm2p)
678                         : "memory", "v0", "v1");
679 #else
680                     asm volatile(
681                         "pld        [%0, #256]          \n"
682                         "vld1.f32   {d0-d3}, [%0 :128]  \n"
683                         "vst1.f32   {d0-d3}, [%1 :128]! \n"
684                         : "=r"(r0),  // %0
685                         "=r"(tm2p) // %1
686                         : "0"(r0),
687                         "1"(tm2p)
688                         : "memory", "q0", "q1");
689 #endif // __aarch64__
690                     r0 += bottom_blob_tm.cstep * 4;
691                 }
692             }
693             for (; i < tiles; i++)
694             {
695 #if __aarch64__
696                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
697 #else
698                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
699 #endif
700 
701                 const float* r0 = bottom_blob_tm;
702 
703                 r0 += (r * tiles + i) * 4;
704 
705                 for (int q = 0; q < inch; q++)
706                 {
707 #if __aarch64__
708                     asm volatile(
709                         "prfm   pldl1keep, [%0, #128]       \n"
710                         "ld1    {v0.4s}, [%0]               \n"
711                         "st1    {v0.4s}, [%1], #16          \n"
712                         : "=r"(r0),  // %0
713                         "=r"(tm2p) // %1
714                         : "0"(r0),
715                         "1"(tm2p)
716                         : "memory", "v0");
717 #else
718                     asm volatile(
719                         "pld        [%0, #128]          \n"
720                         "vld1.f32   {d0-d1}, [%0 :128]  \n"
721                         "vst1.f32   {d0-d1}, [%1 :128]! \n"
722                         : "=r"(r0),  // %0
723                         "=r"(tm2p) // %1
724                         : "0"(r0),
725                         "1"(tm2p)
726                         : "memory", "q0");
727 #endif // __aarch64__
728                     r0 += bottom_blob_tm.cstep * 4;
729                 }
730             }
731         }
732 
733         bottom_blob_tm = Mat();
734         // permute end
735 
736         top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
737 
738         int remain_outch_start = 0;
739 
740 #if __ARM_NEON && __aarch64__
741         int nn_outch = 0;
742         nn_outch = outch >> 1;
743         remain_outch_start = nn_outch << 1;
744 
745         #pragma omp parallel for num_threads(opt.num_threads)
746         for (int pp = 0; pp < nn_outch; pp++)
747         {
748             int p = pp * 2;
749 
750             float* output0_tm = top_blob_tm.channel(p);
751             float* output1_tm = top_blob_tm.channel(p + 1);
752 
753             const Mat kernel01_tm = kernel_tm.channel(pp);
754 
755             for (int r = 0; r < 64; r++)
756             {
757                 const Mat bb2 = bottom_blob_tm2.channel(r);
758 
759                 int i = 0;
760                 for (; i + 11 < tiles; i += 12)
761                 {
762                     const float* r0 = bb2.row(i / 12);
763 
764                     const float* k01 = kernel01_tm.row(r);
765 
766                     int nn = inch; // inch always > 0
767 
768                     asm volatile(
769                         "eor    v8.16b, v8.16b, v8.16b      \n"
770                         "eor    v9.16b, v9.16b, v9.16b      \n"
771                         "eor    v10.16b, v10.16b, v10.16b   \n"
772                         "eor    v11.16b, v11.16b, v11.16b   \n"
773                         "eor    v12.16b, v12.16b, v12.16b   \n"
774                         "eor    v13.16b, v13.16b, v13.16b   \n"
775                         "eor    v14.16b, v14.16b, v14.16b   \n"
776                         "eor    v15.16b, v15.16b, v15.16b   \n"
777                         "eor    v16.16b, v16.16b, v16.16b   \n"
778                         "eor    v17.16b, v17.16b, v17.16b   \n"
779                         "eor    v18.16b, v18.16b, v18.16b   \n"
780                         "eor    v19.16b, v19.16b, v19.16b   \n"
781                         "eor    v20.16b, v20.16b, v20.16b   \n"
782                         "eor    v21.16b, v21.16b, v21.16b   \n"
783                         "eor    v22.16b, v22.16b, v22.16b   \n"
784                         "eor    v23.16b, v23.16b, v23.16b   \n"
785                         "eor    v24.16b, v24.16b, v24.16b   \n"
786                         "eor    v25.16b, v25.16b, v25.16b   \n"
787                         "eor    v26.16b, v26.16b, v26.16b   \n"
788                         "eor    v27.16b, v27.16b, v27.16b   \n"
789                         "eor    v28.16b, v28.16b, v28.16b   \n"
790                         "eor    v29.16b, v29.16b, v29.16b   \n"
791                         "eor    v30.16b, v30.16b, v30.16b   \n"
792                         "eor    v31.16b, v31.16b, v31.16b   \n"
793 
794                         "0:                                 \n"
795 
796                         "prfm   pldl1keep, [%3, #512]       \n"
797                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
798 
799                         "prfm   pldl1keep, [%4, #512]       \n"
800                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64   \n" // w0011_01
801 
802                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
803                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
804                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
805                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
806                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
807                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
808                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
809                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
810                         "fmla   v16.4s, v4.4s, v2.s[0]      \n"
811                         "fmla   v17.4s, v4.4s, v2.s[1]      \n"
812                         "fmla   v18.4s, v4.4s, v2.s[2]      \n"
813                         "fmla   v19.4s, v4.4s, v2.s[3]      \n"
814 
815                         "fmla   v20.4s, v5.4s, v0.s[0]      \n"
816                         "fmla   v21.4s, v5.4s, v0.s[1]      \n"
817                         "fmla   v22.4s, v5.4s, v0.s[2]      \n"
818                         "fmla   v23.4s, v5.4s, v0.s[3]      \n"
819                         "fmla   v24.4s, v5.4s, v1.s[0]      \n"
820                         "fmla   v25.4s, v5.4s, v1.s[1]      \n"
821                         "fmla   v26.4s, v5.4s, v1.s[2]      \n"
822                         "fmla   v27.4s, v5.4s, v1.s[3]      \n"
823                         "fmla   v28.4s, v5.4s, v2.s[0]      \n"
824                         "fmla   v29.4s, v5.4s, v2.s[1]      \n"
825                         "fmla   v30.4s, v5.4s, v2.s[2]      \n"
826                         "fmla   v31.4s, v5.4s, v2.s[3]      \n"
827 
828                         "fmla   v8.4s, v6.4s, v3.s[0]       \n"
829                         "fmla   v9.4s, v6.4s, v3.s[1]       \n"
830                         "fmla   v10.4s, v6.4s, v3.s[2]      \n"
831                         "fmla   v11.4s, v6.4s, v3.s[3]      \n"
832 
833                         "fmla   v20.4s, v7.4s, v3.s[0]      \n"
834                         "fmla   v21.4s, v7.4s, v3.s[1]      \n"
835                         "fmla   v22.4s, v7.4s, v3.s[2]      \n"
836                         "fmla   v23.4s, v7.4s, v3.s[3]      \n"
837 
838                         "prfm   pldl1keep, [%3, #512]       \n"
839                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
840 
841                         "fmla   v12.4s, v6.4s, v0.s[0]      \n"
842                         "fmla   v13.4s, v6.4s, v0.s[1]      \n"
843                         "fmla   v14.4s, v6.4s, v0.s[2]      \n"
844                         "fmla   v15.4s, v6.4s, v0.s[3]      \n"
845                         "fmla   v16.4s, v6.4s, v1.s[0]      \n"
846                         "fmla   v17.4s, v6.4s, v1.s[1]      \n"
847                         "fmla   v18.4s, v6.4s, v1.s[2]      \n"
848                         "fmla   v19.4s, v6.4s, v1.s[3]      \n"
849 
850                         "fmla   v24.4s, v7.4s, v0.s[0]      \n"
851                         "fmla   v25.4s, v7.4s, v0.s[1]      \n"
852                         "fmla   v26.4s, v7.4s, v0.s[2]      \n"
853                         "fmla   v27.4s, v7.4s, v0.s[3]      \n"
854                         "fmla   v28.4s, v7.4s, v1.s[0]      \n"
855                         "fmla   v29.4s, v7.4s, v1.s[1]      \n"
856                         "fmla   v30.4s, v7.4s, v1.s[2]      \n"
857                         "fmla   v31.4s, v7.4s, v1.s[3]      \n"
858 
859                         "prfm   pldl1keep, [%4, #512]       \n"
860                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64   \n" // w2233_01
861 
862                         "fmla   v8.4s, v4.4s, v2.s[0]       \n"
863                         "fmla   v9.4s, v4.4s, v2.s[1]       \n"
864                         "fmla   v10.4s, v4.4s, v2.s[2]      \n"
865                         "fmla   v11.4s, v4.4s, v2.s[3]      \n"
866                         "fmla   v12.4s, v4.4s, v3.s[0]      \n"
867                         "fmla   v13.4s, v4.4s, v3.s[1]      \n"
868                         "fmla   v14.4s, v4.4s, v3.s[2]      \n"
869                         "fmla   v15.4s, v4.4s, v3.s[3]      \n"
870 
871                         "fmla   v20.4s, v5.4s, v2.s[0]      \n"
872                         "fmla   v21.4s, v5.4s, v2.s[1]      \n"
873                         "fmla   v22.4s, v5.4s, v2.s[2]      \n"
874                         "fmla   v23.4s, v5.4s, v2.s[3]      \n"
875                         "fmla   v24.4s, v5.4s, v3.s[0]      \n"
876                         "fmla   v25.4s, v5.4s, v3.s[1]      \n"
877                         "fmla   v26.4s, v5.4s, v3.s[2]      \n"
878                         "fmla   v27.4s, v5.4s, v3.s[3]      \n"
879 
880                         "prfm   pldl1keep, [%3, #512]       \n"
881                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
882 
883                         "fmla   v16.4s, v4.4s, v0.s[0]      \n"
884                         "fmla   v17.4s, v4.4s, v0.s[1]      \n"
885                         "fmla   v18.4s, v4.4s, v0.s[2]      \n"
886                         "fmla   v19.4s, v4.4s, v0.s[3]      \n"
887 
888                         "fmla   v28.4s, v5.4s, v0.s[0]      \n"
889                         "fmla   v29.4s, v5.4s, v0.s[1]      \n"
890                         "fmla   v30.4s, v5.4s, v0.s[2]      \n"
891                         "fmla   v31.4s, v5.4s, v0.s[3]      \n"
892 
893                         "fmla   v8.4s, v6.4s, v1.s[0]       \n"
894                         "fmla   v9.4s, v6.4s, v1.s[1]       \n"
895                         "fmla   v10.4s, v6.4s, v1.s[2]      \n"
896                         "fmla   v11.4s, v6.4s, v1.s[3]      \n"
897                         "fmla   v12.4s, v6.4s, v2.s[0]      \n"
898                         "fmla   v13.4s, v6.4s, v2.s[1]      \n"
899                         "fmla   v14.4s, v6.4s, v2.s[2]      \n"
900                         "fmla   v15.4s, v6.4s, v2.s[3]      \n"
901                         "fmla   v16.4s, v6.4s, v3.s[0]      \n"
902                         "fmla   v17.4s, v6.4s, v3.s[1]      \n"
903                         "fmla   v18.4s, v6.4s, v3.s[2]      \n"
904                         "fmla   v19.4s, v6.4s, v3.s[3]      \n"
905 
906                         "subs   %w0, %w0, #1                \n"
907 
908                         "fmla   v20.4s, v7.4s, v1.s[0]      \n"
909                         "fmla   v21.4s, v7.4s, v1.s[1]      \n"
910                         "fmla   v22.4s, v7.4s, v1.s[2]      \n"
911                         "fmla   v23.4s, v7.4s, v1.s[3]      \n"
912                         "fmla   v24.4s, v7.4s, v2.s[0]      \n"
913                         "fmla   v25.4s, v7.4s, v2.s[1]      \n"
914                         "fmla   v26.4s, v7.4s, v2.s[2]      \n"
915                         "fmla   v27.4s, v7.4s, v2.s[3]      \n"
916                         "fmla   v28.4s, v7.4s, v3.s[0]      \n"
917                         "fmla   v29.4s, v7.4s, v3.s[1]      \n"
918                         "fmla   v30.4s, v7.4s, v3.s[2]      \n"
919                         "fmla   v31.4s, v7.4s, v3.s[3]      \n"
920 
921                         "bne    0b                          \n"
922 
923                         "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
924                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
925                         "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
926                         "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
927                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
928                         "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
929 
930                         : "=r"(nn),         // %0
931                         "=r"(output0_tm), // %1
932                         "=r"(output1_tm), // %2
933                         "=r"(r0),         // %3
934                         "=r"(k01)         // %4
935                         : "0"(nn),
936                         "1"(output0_tm),
937                         "2"(output1_tm),
938                         "3"(r0),
939                         "4"(k01)
940                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
941                 }
942                 for (; i + 7 < tiles; i += 8)
943                 {
944                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
945 
946                     const float* k01 = kernel01_tm.row(r);
947 
948                     int nn = inch; // inch always > 0
949 
950                     asm volatile(
951                         "eor    v16.16b, v16.16b, v16.16b   \n"
952                         "eor    v17.16b, v17.16b, v17.16b   \n"
953                         "eor    v18.16b, v18.16b, v18.16b   \n"
954                         "eor    v19.16b, v19.16b, v19.16b   \n"
955                         "eor    v20.16b, v20.16b, v20.16b   \n"
956                         "eor    v21.16b, v21.16b, v21.16b   \n"
957                         "eor    v22.16b, v22.16b, v22.16b   \n"
958                         "eor    v23.16b, v23.16b, v23.16b   \n"
959                         "eor    v24.16b, v24.16b, v24.16b   \n"
960                         "eor    v25.16b, v25.16b, v25.16b   \n"
961                         "eor    v26.16b, v26.16b, v26.16b   \n"
962                         "eor    v27.16b, v27.16b, v27.16b   \n"
963                         "eor    v28.16b, v28.16b, v28.16b   \n"
964                         "eor    v29.16b, v29.16b, v29.16b   \n"
965                         "eor    v30.16b, v30.16b, v30.16b   \n"
966                         "eor    v31.16b, v31.16b, v31.16b   \n"
967 
968                         "0:                                 \n"
969 
970                         "prfm   pldl1keep, [%3, #512]       \n"
971                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
972 
973                         "prfm   pldl1keep, [%4, #512]       \n"
974                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
975 
976                         "prfm   pldl1keep, [%3, #512]       \n"
977                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7
978 
979                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
980                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
981                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
982                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
983                         "fmla   v20.4s, v8.4s, v4.s[0]      \n"
984                         "fmla   v21.4s, v8.4s, v5.s[0]      \n"
985                         "fmla   v22.4s, v8.4s, v6.s[0]      \n"
986                         "fmla   v23.4s, v8.4s, v7.s[0]      \n"
987 
988                         "fmla   v24.4s, v9.4s, v0.s[0]      \n"
989                         "fmla   v25.4s, v9.4s, v1.s[0]      \n"
990                         "fmla   v26.4s, v9.4s, v2.s[0]      \n"
991                         "fmla   v27.4s, v9.4s, v3.s[0]      \n"
992                         "fmla   v28.4s, v9.4s, v4.s[0]      \n"
993                         "fmla   v29.4s, v9.4s, v5.s[0]      \n"
994                         "fmla   v30.4s, v9.4s, v6.s[0]      \n"
995                         "fmla   v31.4s, v9.4s, v7.s[0]      \n"
996 
997                         "prfm   pldl1keep, [%4, #512]       \n"
998                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
999 
1000                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
1001                         "fmla   v17.4s, v10.4s, v1.s[1]     \n"
1002                         "fmla   v18.4s, v10.4s, v2.s[1]     \n"
1003                         "fmla   v19.4s, v10.4s, v3.s[1]     \n"
1004                         "fmla   v20.4s, v10.4s, v4.s[1]     \n"
1005                         "fmla   v21.4s, v10.4s, v5.s[1]     \n"
1006                         "fmla   v22.4s, v10.4s, v6.s[1]     \n"
1007                         "fmla   v23.4s, v10.4s, v7.s[1]     \n"
1008 
1009                         "fmla   v24.4s, v11.4s, v0.s[1]     \n"
1010                         "fmla   v25.4s, v11.4s, v1.s[1]     \n"
1011                         "fmla   v26.4s, v11.4s, v2.s[1]     \n"
1012                         "fmla   v27.4s, v11.4s, v3.s[1]     \n"
1013                         "fmla   v28.4s, v11.4s, v4.s[1]     \n"
1014                         "fmla   v29.4s, v11.4s, v5.s[1]     \n"
1015                         "fmla   v30.4s, v11.4s, v6.s[1]     \n"
1016                         "fmla   v31.4s, v11.4s, v7.s[1]     \n"
1017 
1018                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
1019                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
1020                         "fmla   v18.4s, v12.4s, v2.s[2]     \n"
1021                         "fmla   v19.4s, v12.4s, v3.s[2]     \n"
1022                         "fmla   v20.4s, v12.4s, v4.s[2]     \n"
1023                         "fmla   v21.4s, v12.4s, v5.s[2]     \n"
1024                         "fmla   v22.4s, v12.4s, v6.s[2]     \n"
1025                         "fmla   v23.4s, v12.4s, v7.s[2]     \n"
1026 
1027                         "fmla   v24.4s, v13.4s, v0.s[2]     \n"
1028                         "fmla   v25.4s, v13.4s, v1.s[2]     \n"
1029                         "fmla   v26.4s, v13.4s, v2.s[2]     \n"
1030                         "fmla   v27.4s, v13.4s, v3.s[2]     \n"
1031                         "fmla   v28.4s, v13.4s, v4.s[2]     \n"
1032                         "fmla   v29.4s, v13.4s, v5.s[2]     \n"
1033                         "fmla   v30.4s, v13.4s, v6.s[2]     \n"
1034                         "fmla   v31.4s, v13.4s, v7.s[2]     \n"
1035 
1036                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
1037                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
1038                         "fmla   v18.4s, v14.4s, v2.s[3]     \n"
1039                         "fmla   v19.4s, v14.4s, v3.s[3]     \n"
1040                         "fmla   v20.4s, v14.4s, v4.s[3]     \n"
1041                         "fmla   v21.4s, v14.4s, v5.s[3]     \n"
1042                         "fmla   v22.4s, v14.4s, v6.s[3]     \n"
1043                         "fmla   v23.4s, v14.4s, v7.s[3]     \n"
1044 
1045                         "subs   %w0, %w0, #1                \n"
1046 
1047                         "fmla   v24.4s, v15.4s, v0.s[3]     \n"
1048                         "fmla   v25.4s, v15.4s, v1.s[3]     \n"
1049                         "fmla   v26.4s, v15.4s, v2.s[3]     \n"
1050                         "fmla   v27.4s, v15.4s, v3.s[3]     \n"
1051                         "fmla   v28.4s, v15.4s, v4.s[3]     \n"
1052                         "fmla   v29.4s, v15.4s, v5.s[3]     \n"
1053                         "fmla   v30.4s, v15.4s, v6.s[3]     \n"
1054                         "fmla   v31.4s, v15.4s, v7.s[3]     \n"
1055 
1056                         "bne    0b                          \n"
1057 
1058                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1059                         "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
1060                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
1061                         "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
1062 
1063                         : "=r"(nn),         // %0
1064                         "=r"(output0_tm), // %1
1065                         "=r"(output1_tm), // %2
1066                         "=r"(r0),         // %3
1067                         "=r"(k01)         // %4
1068                         : "0"(nn),
1069                         "1"(output0_tm),
1070                         "2"(output1_tm),
1071                         "3"(r0),
1072                         "4"(k01)
1073                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
1074                 }
1075                 for (; i + 3 < tiles; i += 4)
1076                 {
1077                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1078 
1079                     const float* k01 = kernel01_tm.row(r);
1080 
1081                     int nn = inch; // inch always > 0
1082 
1083                     asm volatile(
1084                         "eor    v16.16b, v16.16b, v16.16b   \n"
1085                         "eor    v17.16b, v17.16b, v17.16b   \n"
1086                         "eor    v18.16b, v18.16b, v18.16b   \n"
1087                         "eor    v19.16b, v19.16b, v19.16b   \n"
1088                         "eor    v20.16b, v20.16b, v20.16b   \n"
1089                         "eor    v21.16b, v21.16b, v21.16b   \n"
1090                         "eor    v22.16b, v22.16b, v22.16b   \n"
1091                         "eor    v23.16b, v23.16b, v23.16b   \n"
1092 
1093                         "0:                                 \n"
1094 
1095                         "prfm   pldl1keep, [%3, #512]       \n"
1096                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
1097 
1098                         "prfm   pldl1keep, [%4, #512]       \n"
1099                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1100 
1101                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1102                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1103                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
1104                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
1105 
1106                         "fmla   v20.4s, v9.4s, v0.s[0]     \n"
1107                         "fmla   v21.4s, v9.4s, v1.s[0]     \n"
1108                         "fmla   v22.4s, v9.4s, v2.s[0]     \n"
1109                         "fmla   v23.4s, v9.4s, v3.s[0]     \n"
1110 
1111                         "prfm   pldl1keep, [%4, #512]       \n"
1112                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1113 
1114                         "fmla   v16.4s, v10.4s, v0.s[1]      \n"
1115                         "fmla   v17.4s, v10.4s, v1.s[1]      \n"
1116                         "fmla   v18.4s, v10.4s, v2.s[1]      \n"
1117                         "fmla   v19.4s, v10.4s, v3.s[1]      \n"
1118 
1119                         "fmla   v20.4s, v11.4s, v0.s[1]     \n"
1120                         "fmla   v21.4s, v11.4s, v1.s[1]     \n"
1121                         "fmla   v22.4s, v11.4s, v2.s[1]     \n"
1122                         "fmla   v23.4s, v11.4s, v3.s[1]     \n"
1123 
1124                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
1125                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
1126                         "fmla   v18.4s, v12.4s, v2.s[2]     \n"
1127                         "fmla   v19.4s, v12.4s, v3.s[2]     \n"
1128 
1129                         "fmla   v20.4s, v13.4s, v0.s[2]     \n"
1130                         "fmla   v21.4s, v13.4s, v1.s[2]     \n"
1131                         "fmla   v22.4s, v13.4s, v2.s[2]     \n"
1132                         "fmla   v23.4s, v13.4s, v3.s[2]     \n"
1133 
1134                         "subs   %w0, %w0, #1                \n"
1135 
1136                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
1137                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
1138                         "fmla   v18.4s, v14.4s, v2.s[3]     \n"
1139                         "fmla   v19.4s, v14.4s, v3.s[3]     \n"
1140 
1141                         "fmla   v20.4s, v15.4s, v0.s[3]     \n"
1142                         "fmla   v21.4s, v15.4s, v1.s[3]     \n"
1143                         "fmla   v22.4s, v15.4s, v2.s[3]     \n"
1144                         "fmla   v23.4s, v15.4s, v3.s[3]     \n"
1145 
1146                         "bne    0b                          \n"
1147 
1148                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1149                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
1150 
1151                         : "=r"(nn),         // %0
1152                         "=r"(output0_tm), // %1
1153                         "=r"(output1_tm), // %2
1154                         "=r"(r0),         // %3
1155                         "=r"(k01)         // %4
1156                         : "0"(nn),
1157                         "1"(output0_tm),
1158                         "2"(output1_tm),
1159                         "3"(r0),
1160                         "4"(k01)
1161                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
1162                 }
1163                 for (; i + 1 < tiles; i += 2)
1164                 {
1165                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
1166 
1167                     const float* k01 = kernel01_tm.row(r);
1168 
1169                     int nn = inch; // inch always > 0
1170 
1171                     asm volatile(
1172                         "eor    v16.16b, v16.16b, v16.16b   \n"
1173                         "eor    v17.16b, v17.16b, v17.16b   \n"
1174                         "eor    v18.16b, v18.16b, v18.16b   \n"
1175                         "eor    v19.16b, v19.16b, v19.16b   \n"
1176 
1177                         "0:                                 \n"
1178 
1179                         "prfm   pldl1keep, [%3, #256]       \n"
1180                         "ld1    {v0.4s, v1.4s}, [%3], #32   \n" // r0 r1
1181 
1182                         "prfm   pldl1keep, [%4, #512]       \n"
1183                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1184 
1185                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1186                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1187                         "fmla   v18.4s, v9.4s, v0.s[0]      \n"
1188                         "fmla   v19.4s, v9.4s, v1.s[0]      \n"
1189 
1190                         "prfm   pldl1keep, [%4, #512]       \n"
1191                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1192 
1193                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
1194                         "fmla   v17.4s, v10.4s, v1.s[1]     \n"
1195                         "fmla   v18.4s, v11.4s, v0.s[1]     \n"
1196                         "fmla   v19.4s, v11.4s, v1.s[1]     \n"
1197 
1198                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
1199                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
1200                         "fmla   v18.4s, v13.4s, v0.s[2]     \n"
1201                         "fmla   v19.4s, v13.4s, v1.s[2]     \n"
1202 
1203                         "subs   %w0, %w0, #1                \n"
1204 
1205                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
1206                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
1207                         "fmla   v18.4s, v15.4s, v0.s[3]     \n"
1208                         "fmla   v19.4s, v15.4s, v1.s[3]     \n"
1209 
1210                         "bne    0b                          \n"
1211 
1212                         "st1    {v16.4s, v17.4s}, [%1], #32 \n"
1213                         "st1    {v18.4s, v19.4s}, [%2], #32 \n"
1214 
1215                         : "=r"(nn),         // %0
1216                         "=r"(output0_tm), // %1
1217                         "=r"(output1_tm), // %2
1218                         "=r"(r0),         // %3
1219                         "=r"(k01)         // %4
1220                         : "0"(nn),
1221                         "1"(output0_tm),
1222                         "2"(output1_tm),
1223                         "3"(r0),
1224                         "4"(k01)
1225                         : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1226                 }
1227                 for (; i < tiles; i++)
1228                 {
1229                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
1230 
1231                     const float* k01 = kernel01_tm.row(r);
1232 
1233                     int nn = inch; // inch always > 0
1234 
1235                     asm volatile(
1236                         "eor    v16.16b, v16.16b, v16.16b   \n"
1237                         "eor    v17.16b, v17.16b, v17.16b   \n"
1238 
1239                         "0:                                 \n"
1240 
1241                         "prfm   pldl1keep, [%3, #128]       \n"
1242                         "ld1    {v0.4s}, [%3], #16          \n" // r0
1243 
1244                         "prfm   pldl1keep, [%4, #512]       \n"
1245                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1246 
1247                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1248                         "fmla   v17.4s, v9.4s, v0.s[0]      \n"
1249 
1250                         "prfm   pldl1keep, [%4, #512]       \n"
1251                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1252 
1253                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
1254                         "fmla   v17.4s, v11.4s, v0.s[1]     \n"
1255 
1256                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
1257                         "fmla   v17.4s, v13.4s, v0.s[2]     \n"
1258 
1259                         "subs   %w0, %w0, #1                \n"
1260 
1261                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
1262                         "fmla   v17.4s, v15.4s, v0.s[3]     \n"
1263 
1264                         "bne    0b                          \n"
1265 
1266                         "st1    {v16.4s}, [%1], #16         \n"
1267                         "st1    {v17.4s}, [%2], #16         \n"
1268 
1269                         : "=r"(nn),         // %0
1270                         "=r"(output0_tm), // %1
1271                         "=r"(output1_tm), // %2
1272                         "=r"(r0),         // %3
1273                         "=r"(k01)         // %4
1274                         : "0"(nn),
1275                         "1"(output0_tm),
1276                         "2"(output1_tm),
1277                         "3"(r0),
1278                         "4"(k01)
1279                         : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
1280                 }
1281             }
1282         }
1283 #endif // __ARM_NEON && __aarch64__
1284 
1285         #pragma omp parallel for num_threads(opt.num_threads)
1286         for (int p = remain_outch_start; p < outch; p++)
1287         {
1288             float* output0_tm = top_blob_tm.channel(p);
1289 
1290 #if __aarch64__
1291             const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
1292 #else
1293             const Mat kernel0_tm = kernel_tm.channel(p);
1294 #endif
1295 
1296             for (int r = 0; r < 64; r++)
1297             {
1298                 const Mat bb2 = bottom_blob_tm2.channel(r);
1299 
1300                 int i = 0;
1301 #if __aarch64__
1302                 for (; i + 11 < tiles; i += 12)
1303                 {
1304                     const float* r0 = bb2.row(i / 12);
1305 
1306                     const float* k0 = kernel0_tm.row(r);
1307 
1308                     int nn = inch; // inch always > 0
1309 
1310                     asm volatile(
1311                         "eor    v8.16b, v8.16b, v8.16b      \n"
1312                         "eor    v9.16b, v9.16b, v9.16b      \n"
1313                         "eor    v10.16b, v10.16b, v10.16b   \n"
1314                         "eor    v11.16b, v11.16b, v11.16b   \n"
1315                         "eor    v12.16b, v12.16b, v12.16b   \n"
1316                         "eor    v13.16b, v13.16b, v13.16b   \n"
1317                         "eor    v14.16b, v14.16b, v14.16b   \n"
1318                         "eor    v15.16b, v15.16b, v15.16b   \n"
1319                         "eor    v16.16b, v16.16b, v16.16b   \n"
1320                         "eor    v17.16b, v17.16b, v17.16b   \n"
1321                         "eor    v18.16b, v18.16b, v18.16b   \n"
1322                         "eor    v19.16b, v19.16b, v19.16b   \n"
1323 
1324                         "0:                                 \n"
1325 
1326                         "prfm   pldl1keep, [%2, #512]       \n"
1327                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1328 
1329                         "prfm   pldl1keep, [%3, #512]       \n"
1330                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0
1331 
1332                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
1333                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
1334                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
1335                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
1336                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
1337                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
1338                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
1339                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
1340                         "fmla   v16.4s, v4.4s, v2.s[0]      \n"
1341                         "fmla   v17.4s, v4.4s, v2.s[1]      \n"
1342                         "fmla   v18.4s, v4.4s, v2.s[2]      \n"
1343                         "fmla   v19.4s, v4.4s, v2.s[3]      \n"
1344 
1345                         "prfm   pldl1keep, [%2, #512]       \n"
1346                         "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
1347 
1348                         "fmla   v8.4s, v5.4s, v3.s[0]       \n"
1349                         "fmla   v9.4s, v5.4s, v3.s[1]       \n"
1350                         "fmla   v10.4s, v5.4s, v3.s[2]      \n"
1351                         "fmla   v11.4s, v5.4s, v3.s[3]      \n"
1352                         "fmla   v12.4s, v5.4s, v20.s[0]     \n"
1353                         "fmla   v13.4s, v5.4s, v20.s[1]     \n"
1354                         "fmla   v14.4s, v5.4s, v20.s[2]     \n"
1355                         "fmla   v15.4s, v5.4s, v20.s[3]     \n"
1356                         "fmla   v16.4s, v5.4s, v21.s[0]     \n"
1357                         "fmla   v17.4s, v5.4s, v21.s[1]     \n"
1358                         "fmla   v18.4s, v5.4s, v21.s[2]     \n"
1359                         "fmla   v19.4s, v5.4s, v21.s[3]     \n"
1360 
1361                         "prfm   pldl1keep, [%2, #512]       \n"
1362                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
1363 
1364                         "fmla   v8.4s, v6.4s, v22.s[0]      \n"
1365                         "fmla   v9.4s, v6.4s, v22.s[1]      \n"
1366                         "fmla   v10.4s, v6.4s, v22.s[2]     \n"
1367                         "fmla   v11.4s, v6.4s, v22.s[3]     \n"
1368                         "fmla   v12.4s, v6.4s, v23.s[0]     \n"
1369                         "fmla   v13.4s, v6.4s, v23.s[1]     \n"
1370                         "fmla   v14.4s, v6.4s, v23.s[2]     \n"
1371                         "fmla   v15.4s, v6.4s, v23.s[3]     \n"
1372                         "fmla   v16.4s, v6.4s, v24.s[0]     \n"
1373                         "fmla   v17.4s, v6.4s, v24.s[1]     \n"
1374                         "fmla   v18.4s, v6.4s, v24.s[2]     \n"
1375                         "fmla   v19.4s, v6.4s, v24.s[3]     \n"
1376 
1377                         "subs   %w0, %w0, #1                \n"
1378 
1379                         "fmla   v8.4s, v7.4s, v25.s[0]      \n"
1380                         "fmla   v9.4s, v7.4s, v25.s[1]      \n"
1381                         "fmla   v10.4s, v7.4s, v25.s[2]     \n"
1382                         "fmla   v11.4s, v7.4s, v25.s[3]     \n"
1383                         "fmla   v12.4s, v7.4s, v26.s[0]     \n"
1384                         "fmla   v13.4s, v7.4s, v26.s[1]     \n"
1385                         "fmla   v14.4s, v7.4s, v26.s[2]     \n"
1386                         "fmla   v15.4s, v7.4s, v26.s[3]     \n"
1387                         "fmla   v16.4s, v7.4s, v27.s[0]     \n"
1388                         "fmla   v17.4s, v7.4s, v27.s[1]     \n"
1389                         "fmla   v18.4s, v7.4s, v27.s[2]     \n"
1390                         "fmla   v19.4s, v7.4s, v27.s[3]     \n"
1391 
1392                         "bne    0b                          \n"
1393 
1394                         "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
1395                         "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
1396                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1397 
1398                         : "=r"(nn),         // %0
1399                         "=r"(output0_tm), // %1
1400                         "=r"(r0),         // %2
1401                         "=r"(k0)          // %3
1402                         : "0"(nn),
1403                         "1"(output0_tm),
1404                         "2"(r0),
1405                         "3"(k0)
1406                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
1407                 }
1408 #endif
1409                 for (; i + 7 < tiles; i += 8)
1410                 {
1411 #if __aarch64__
1412                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1413 #else
1414                     const float* r0 = bb2.row(i / 8);
1415 #endif
1416 
1417                     const float* k0 = kernel0_tm.row(r);
1418 
1419                     int nn = inch; // inch always > 0
1420 
1421 #if __aarch64__
1422                     asm volatile(
1423                         "eor    v16.16b, v16.16b, v16.16b   \n"
1424                         "eor    v17.16b, v17.16b, v17.16b   \n"
1425                         "eor    v18.16b, v18.16b, v18.16b   \n"
1426                         "eor    v19.16b, v19.16b, v19.16b   \n"
1427                         "eor    v20.16b, v20.16b, v20.16b   \n"
1428                         "eor    v21.16b, v21.16b, v21.16b   \n"
1429                         "eor    v22.16b, v22.16b, v22.16b   \n"
1430                         "eor    v23.16b, v23.16b, v23.16b   \n"
1431 
1432                         "0:                                 \n"
1433 
1434                         "prfm   pldl1keep, [%2, #512]       \n"
1435                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1436 
1437                         "prfm   pldl1keep, [%3, #512]       \n"
1438                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1439 
1440                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1441                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1442                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
1443                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
1444 
1445                         "prfm   pldl1keep, [%2, #512]       \n"
1446                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7
1447 
1448                         "fmla   v20.4s, v8.4s, v4.s[0]      \n"
1449                         "fmla   v21.4s, v8.4s, v5.s[0]      \n"
1450                         "fmla   v22.4s, v8.4s, v6.s[0]      \n"
1451                         "fmla   v23.4s, v8.4s, v7.s[0]      \n"
1452 
1453                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1454                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1455                         "fmla   v18.4s, v9.4s, v2.s[1]      \n"
1456                         "fmla   v19.4s, v9.4s, v3.s[1]      \n"
1457                         "fmla   v20.4s, v9.4s, v4.s[1]      \n"
1458                         "fmla   v21.4s, v9.4s, v5.s[1]      \n"
1459                         "fmla   v22.4s, v9.4s, v6.s[1]      \n"
1460                         "fmla   v23.4s, v9.4s, v7.s[1]      \n"
1461 
1462                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1463                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1464                         "fmla   v18.4s, v10.4s, v2.s[2]     \n"
1465                         "fmla   v19.4s, v10.4s, v3.s[2]     \n"
1466                         "fmla   v20.4s, v10.4s, v4.s[2]     \n"
1467                         "fmla   v21.4s, v10.4s, v5.s[2]     \n"
1468                         "fmla   v22.4s, v10.4s, v6.s[2]     \n"
1469                         "fmla   v23.4s, v10.4s, v7.s[2]     \n"
1470 
1471                         "subs   %w0, %w0, #1                \n"
1472 
1473                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1474                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1475                         "fmla   v18.4s, v11.4s, v2.s[3]     \n"
1476                         "fmla   v19.4s, v11.4s, v3.s[3]     \n"
1477                         "fmla   v20.4s, v11.4s, v4.s[3]     \n"
1478                         "fmla   v21.4s, v11.4s, v5.s[3]     \n"
1479                         "fmla   v22.4s, v11.4s, v6.s[3]     \n"
1480                         "fmla   v23.4s, v11.4s, v7.s[3]     \n"
1481 
1482                         "bne    0b                          \n"
1483 
1484                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1485                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
1486 
1487                         : "=r"(nn),         // %0
1488                         "=r"(output0_tm), // %1
1489                         "=r"(r0),         // %2
1490                         "=r"(k0)          // %3
1491                         : "0"(nn),
1492                         "1"(output0_tm),
1493                         "2"(r0),
1494                         "3"(k0)
1495                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
1496 #else
1497                     asm volatile(
1498                         "veor       q8, q8          \n"
1499                         "veor       q9, q9          \n"
1500                         "veor       q10, q10        \n"
1501                         "veor       q11, q11        \n"
1502                         "veor       q12, q12        \n"
1503                         "veor       q13, q13        \n"
1504                         "veor       q14, q14        \n"
1505                         "veor       q15, q15        \n"
1506 
1507                         "0:                         \n"
1508 
1509                         "pld        [%2, #512]      \n"
1510                         "vldm       %2!, {d0-d7}    \n"
1511 
1512                         "pld        [%3, #512]      \n"
1513                         "vldm       %3!, {d8-d15}   \n"
1514 
1515                         "vmla.f32   q8, q4, d0[0]   \n"
1516                         "vmla.f32   q9, q4, d0[1]   \n"
1517                         "vmla.f32   q10, q4, d1[0]  \n"
1518                         "vmla.f32   q11, q4, d1[1]  \n"
1519                         "vmla.f32   q12, q4, d2[0]  \n"
1520                         "vmla.f32   q13, q4, d2[1]  \n"
1521                         "vmla.f32   q14, q4, d3[0]  \n"
1522                         "vmla.f32   q15, q4, d3[1]  \n"
1523 
1524                         "vmla.f32   q8, q5, d4[0]   \n"
1525                         "vmla.f32   q9, q5, d4[1]   \n"
1526                         "vmla.f32   q10, q5, d5[0]  \n"
1527                         "vmla.f32   q11, q5, d5[1]  \n"
1528                         "vmla.f32   q12, q5, d6[0]  \n"
1529                         "vmla.f32   q13, q5, d6[1]  \n"
1530                         "vmla.f32   q14, q5, d7[0]  \n"
1531                         "vmla.f32   q15, q5, d7[1]  \n"
1532 
1533                         "pld        [%2, #512]      \n"
1534                         "vldm       %2!, {d0-d7}    \n"
1535 
1536                         "vmla.f32   q8, q6, d0[0]   \n"
1537                         "vmla.f32   q9, q6, d0[1]   \n"
1538                         "vmla.f32   q10, q6, d1[0]  \n"
1539                         "vmla.f32   q11, q6, d1[1]  \n"
1540                         "vmla.f32   q12, q6, d2[0]  \n"
1541                         "vmla.f32   q13, q6, d2[1]  \n"
1542                         "vmla.f32   q14, q6, d3[0]  \n"
1543                         "vmla.f32   q15, q6, d3[1]  \n"
1544 
1545                         "subs       %0, %0, #1      \n"
1546 
1547                         "vmla.f32   q8, q7, d4[0]   \n"
1548                         "vmla.f32   q9, q7, d4[1]   \n"
1549                         "vmla.f32   q10, q7, d5[0]  \n"
1550                         "vmla.f32   q11, q7, d5[1]  \n"
1551                         "vmla.f32   q12, q7, d6[0]  \n"
1552                         "vmla.f32   q13, q7, d6[1]  \n"
1553                         "vmla.f32   q14, q7, d7[0]  \n"
1554                         "vmla.f32   q15, q7, d7[1]  \n"
1555 
1556                         "bne        0b              \n"
1557 
1558                         "vstm       %1!, {d16-d23}  \n"
1559                         "vstm       %1!, {d24-d31}  \n"
1560 
1561                         : "=r"(nn),         // %0
1562                         "=r"(output0_tm), // %1
1563                         "=r"(r0),         // %2
1564                         "=r"(k0)          // %3
1565                         : "0"(nn),
1566                         "1"(output0_tm),
1567                         "2"(r0),
1568                         "3"(k0)
1569                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1570 #endif
1571                 }
1572                 for (; i + 3 < tiles; i += 4)
1573                 {
1574 #if __aarch64__
1575                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1576 #else
1577                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1578 #endif
1579 
1580                     const float* k0 = kernel0_tm.row(r);
1581 
1582                     int nn = inch; // inch always > 0
1583 
1584 #if __aarch64__
1585                     asm volatile(
1586                         "eor    v16.16b, v16.16b, v16.16b   \n"
1587                         "eor    v17.16b, v17.16b, v17.16b   \n"
1588                         "eor    v18.16b, v18.16b, v18.16b   \n"
1589                         "eor    v19.16b, v19.16b, v19.16b   \n"
1590 
1591                         "0:                                 \n"
1592 
1593                         "prfm   pldl1keep, [%2, #512]       \n"
1594                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1595 
1596                         "prfm   pldl1keep, [%3, #512]       \n"
1597                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1598 
1599                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1600                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1601                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
1602                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
1603 
1604                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1605                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1606                         "fmla   v18.4s, v9.4s, v2.s[1]      \n"
1607                         "fmla   v19.4s, v9.4s, v3.s[1]      \n"
1608 
1609                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1610                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1611                         "fmla   v18.4s, v10.4s, v2.s[2]     \n"
1612                         "fmla   v19.4s, v10.4s, v3.s[2]     \n"
1613 
1614                         "subs   %w0, %w0, #1                \n"
1615 
1616                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1617                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1618                         "fmla   v18.4s, v11.4s, v2.s[3]     \n"
1619                         "fmla   v19.4s, v11.4s, v3.s[3]     \n"
1620 
1621                         "bne    0b                          \n"
1622 
1623                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1624 
1625                         : "=r"(nn),         // %0
1626                         "=r"(output0_tm), // %1
1627                         "=r"(r0),         // %2
1628                         "=r"(k0)          // %3
1629                         : "0"(nn),
1630                         "1"(output0_tm),
1631                         "2"(r0),
1632                         "3"(k0)
1633                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
1634 #else
1635                     asm volatile(
1636                         "veor       q8, q8          \n"
1637                         "veor       q9, q9          \n"
1638                         "veor       q10, q10        \n"
1639                         "veor       q11, q11        \n"
1640 
1641                         "0:                         \n"
1642 
1643                         "pld        [%2, #512]      \n"
1644                         "vldm       %2!, {d0-d7}    \n"
1645 
1646                         "pld        [%3, #512]      \n"
1647                         "vldm       %3!, {d8-d15}   \n"
1648 
1649                         "vmla.f32   q8, q4, d0[0]   \n"
1650                         "vmla.f32   q9, q4, d2[0]   \n"
1651                         "vmla.f32   q10, q4, d4[0]  \n"
1652                         "vmla.f32   q11, q4, d6[0]  \n"
1653 
1654                         "vmla.f32   q8, q5, d0[1]   \n"
1655                         "vmla.f32   q9, q5, d2[1]   \n"
1656                         "vmla.f32   q10, q5, d4[1]  \n"
1657                         "vmla.f32   q11, q5, d6[1]  \n"
1658 
1659                         "vmla.f32   q8, q6, d1[0]   \n"
1660                         "vmla.f32   q9, q6, d3[0]   \n"
1661                         "vmla.f32   q10, q6, d5[0]  \n"
1662                         "vmla.f32   q11, q6, d7[0]  \n"
1663 
1664                         "subs       %0, %0, #1      \n"
1665 
1666                         "vmla.f32   q8, q7, d1[1]   \n"
1667                         "vmla.f32   q9, q7, d3[1]   \n"
1668                         "vmla.f32   q10, q7, d5[1]  \n"
1669                         "vmla.f32   q11, q7, d7[1]  \n"
1670 
1671                         "bne        0b              \n"
1672 
1673                         "vstm       %1!, {d16-d23}  \n"
1674 
1675                         : "=r"(nn),         // %0
1676                         "=r"(output0_tm), // %1
1677                         "=r"(r0),         // %2
1678                         "=r"(k0)          // %3
1679                         : "0"(nn),
1680                         "1"(output0_tm),
1681                         "2"(r0),
1682                         "3"(k0)
1683                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1684 #endif
1685                 }
1686                 for (; i + 1 < tiles; i += 2)
1687                 {
1688 #if __aarch64__
1689                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
1690 #else
1691                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
1692 #endif
1693 
1694                     const float* k0 = kernel0_tm.row(r);
1695 
1696                     int nn = inch; // inch always > 0
1697 
1698 #if __aarch64__
1699                     asm volatile(
1700                         "eor    v16.16b, v16.16b, v16.16b   \n"
1701                         "eor    v17.16b, v17.16b, v17.16b   \n"
1702 
1703                         "0:                                 \n"
1704 
1705                         "prfm   pldl1keep, [%2, #256]       \n"
1706                         "ld1    {v0.4s, v1.4s}, [%2], #32   \n" // r0 r1
1707 
1708                         "prfm   pldl1keep, [%3, #512]       \n"
1709                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1710 
1711                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1712                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1713 
1714                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1715                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1716 
1717                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1718                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1719 
1720                         "subs   %w0, %w0, #1                \n"
1721 
1722                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1723                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1724 
1725                         "bne    0b                          \n"
1726 
1727                         "st1    {v16.4s, v17.4s}, [%1], #32 \n"
1728 
1729                         : "=r"(nn),         // %0
1730                         "=r"(output0_tm), // %1
1731                         "=r"(r0),         // %2
1732                         "=r"(k0)          // %3
1733                         : "0"(nn),
1734                         "1"(output0_tm),
1735                         "2"(r0),
1736                         "3"(k0)
1737                         : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
1738 #else
1739                     asm volatile(
1740                         "veor       q8, q8          \n"
1741                         "veor       q9, q9          \n"
1742 
1743                         "0:                         \n"
1744 
1745                         "pld        [%2, #256]      \n"
1746                         "vld1.f32   {d0-d3}, [%2 :128]! \n"
1747 
1748                         "pld        [%3, #512]      \n"
1749                         "vldm       %3!, {d8-d15}   \n"
1750 
1751                         "vmla.f32   q8, q4, d0[0]   \n"
1752                         "vmla.f32   q9, q4, d2[0]   \n"
1753 
1754                         "vmla.f32   q8, q5, d0[1]   \n"
1755                         "vmla.f32   q9, q5, d2[1]   \n"
1756 
1757                         "vmla.f32   q8, q6, d1[0]   \n"
1758                         "vmla.f32   q9, q6, d3[0]   \n"
1759 
1760                         "subs       %0, %0, #1      \n"
1761 
1762                         "vmla.f32   q8, q7, d1[1]   \n"
1763                         "vmla.f32   q9, q7, d3[1]   \n"
1764 
1765                         "bne        0b              \n"
1766 
1767                         "vst1.f32   {d16-d19}, [%1 :128]! \n"
1768 
1769                         : "=r"(nn),         // %0
1770                         "=r"(output0_tm), // %1
1771                         "=r"(r0),         // %2
1772                         "=r"(k0)          // %3
1773                         : "0"(nn),
1774                         "1"(output0_tm),
1775                         "2"(r0),
1776                         "3"(k0)
1777                         : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
1778 #endif
1779                 }
1780                 for (; i < tiles; i++)
1781                 {
1782 #if __aarch64__
1783                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
1784 #else
1785                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
1786 #endif
1787 
1788                     const float* k0 = kernel0_tm.row(r);
1789 
1790                     int nn = inch; // inch always > 0
1791 
1792 #if __aarch64__
1793                     asm volatile(
1794                         "eor    v16.16b, v16.16b, v16.16b   \n"
1795 
1796                         "0:                                 \n"
1797 
1798                         "prfm   pldl1keep, [%2, #128]       \n"
1799                         "ld1    {v0.4s}, [%2], #16          \n" // r0
1800 
1801                         "prfm   pldl1keep, [%3, #512]       \n"
1802                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1803 
1804                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1805                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1806 
1807                         "subs   %w0, %w0, #1                \n"
1808 
1809                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1810                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1811 
1812                         "bne    0b                          \n"
1813 
1814                         "st1    {v16.4s}, [%1], #16         \n"
1815 
1816                         : "=r"(nn),         // %0
1817                         "=r"(output0_tm), // %1
1818                         "=r"(r0),         // %2
1819                         "=r"(k0)          // %3
1820                         : "0"(nn),
1821                         "1"(output0_tm),
1822                         "2"(r0),
1823                         "3"(k0)
1824                         : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
1825 #else
1826                     asm volatile(
1827                         "veor       q8, q8          \n"
1828 
1829                         "0:                         \n"
1830 
1831                         "pld        [%2, #128]      \n"
1832                         "vld1.f32   {d0-d1}, [%2 :128]! \n"
1833 
1834                         "pld        [%3, #512]      \n"
1835                         "vldm       %3!, {d8-d15}   \n"
1836 
1837                         "vmla.f32   q8, q4, d0[0]   \n"
1838                         "vmla.f32   q8, q5, d0[1]   \n"
1839 
1840                         "subs       %0, %0, #1      \n"
1841 
1842                         "vmla.f32   q8, q6, d1[0]   \n"
1843                         "vmla.f32   q8, q7, d1[1]   \n"
1844 
1845                         "bne        0b              \n"
1846 
1847                         "vst1.f32   {d16-d17}, [%1 :128]! \n"
1848 
1849                         : "=r"(nn),         // %0
1850                         "=r"(output0_tm), // %1
1851                         "=r"(r0),         // %2
1852                         "=r"(k0)          // %3
1853                         : "0"(nn),
1854                         "1"(output0_tm),
1855                         "2"(r0),
1856                         "3"(k0)
1857                         : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
1858 #endif
1859                 }
1860             }
1861         }
1862     }
1863     bottom_blob_tm = Mat();
1864     // END dot
1865 
1866     // BEGIN transform output
1867     Mat top_blob_bordered;
1868     if (outw == top_blob.w && outh == top_blob.h)
1869     {
1870         top_blob_bordered = top_blob;
1871     }
1872     else
1873     {
1874         top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
1875     }
1876     {
1877         //         const float otm[6][8] = {
1878         //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
1879         //             {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
1880         //             {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
1881         //             {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
1882         //             {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
1883         //             {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
1884         //         };
1885 
1886         // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
1887         // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
1888         // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
1889         // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
1890         // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
1891         // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
1892 
1893         int w_tm = outw / 6 * 8;
1894         int h_tm = outh / 6 * 8;
1895         const int tiles = w_tm / 8 * h_tm / 8;
1896 
1897         #pragma omp parallel for num_threads(opt.num_threads)
1898         for (int p = 0; p < outch; p++)
1899         {
1900             const Mat out0_tm = top_blob_tm.channel(p);
1901             Mat out0 = top_blob_bordered.channel(p);
1902 
1903             //             const float bias0 = bias ? bias[p] : 0.f;
1904             float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
1905 
1906             float tmp[6][8][4];
1907 
1908             // tile
1909             for (int i = 0; i < outh / 6; i++)
1910             {
1911                 for (int j = 0; j < outw / 6; j++)
1912                 {
1913                     //                     top_blob_tm.create(tiles, 64, outch, elemsize, elempack);
1914 
1915                     const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 8 + j) * 4;
1916                     const float* output0_tm_1 = output0_tm_0 + tiles * 4;
1917                     const float* output0_tm_2 = output0_tm_0 + tiles * 8;
1918                     const float* output0_tm_3 = output0_tm_0 + tiles * 12;
1919                     const float* output0_tm_4 = output0_tm_0 + tiles * 16;
1920                     const float* output0_tm_5 = output0_tm_0 + tiles * 20;
1921                     const float* output0_tm_6 = output0_tm_0 + tiles * 24;
1922                     const float* output0_tm_7 = output0_tm_0 + tiles * 28;
1923 
1924                     float* output0 = out0.row(i * 6) + (j * 6) * 4;
1925 
1926                     // TODO neon optimize
1927                     for (int m = 0; m < 8; m++)
1928                     {
1929                         float32x4_t _out0tm0 = vld1q_f32(output0_tm_0);
1930                         float32x4_t _out0tm1 = vld1q_f32(output0_tm_1);
1931                         float32x4_t _out0tm2 = vld1q_f32(output0_tm_2);
1932                         float32x4_t _out0tm3 = vld1q_f32(output0_tm_3);
1933                         float32x4_t _out0tm4 = vld1q_f32(output0_tm_4);
1934                         float32x4_t _out0tm5 = vld1q_f32(output0_tm_5);
1935                         float32x4_t _out0tm6 = vld1q_f32(output0_tm_6);
1936                         float32x4_t _out0tm7 = vld1q_f32(output0_tm_7);
1937 
1938                         float32x4_t _tmp024a = vaddq_f32(_out0tm1, _out0tm2);
1939                         float32x4_t _tmp135a = vsubq_f32(_out0tm1, _out0tm2);
1940 
1941                         //                         float tmp024a = output0_tm[1] + output0_tm[2];
1942                         //                         float tmp135a = output0_tm[1] - output0_tm[2];
1943 
1944                         float32x4_t _tmp024b = vaddq_f32(_out0tm3, _out0tm4);
1945                         float32x4_t _tmp135b = vsubq_f32(_out0tm3, _out0tm4);
1946 
1947                         //                         float tmp024b = output0_tm[3] + output0_tm[4];
1948                         //                         float tmp135b = output0_tm[3] - output0_tm[4];
1949 
1950                         float32x4_t _tmp024c = vaddq_f32(_out0tm5, _out0tm6);
1951                         float32x4_t _tmp135c = vsubq_f32(_out0tm5, _out0tm6);
1952 
1953                         //                         float tmp024c = output0_tm[5] + output0_tm[6];
1954                         //                         float tmp135c = output0_tm[5] - output0_tm[6];
1955 
1956                         float32x4_t _tmp0m = vaddq_f32(vaddq_f32(_out0tm0, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f));
1957                         float32x4_t _tmp2m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f);
1958                         float32x4_t _tmp4m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f);
1959                         vst1q_f32(tmp[0][m], _tmp0m);
1960                         vst1q_f32(tmp[2][m], _tmp2m);
1961                         vst1q_f32(tmp[4][m], _tmp4m);
1962 
1963                         //                         tmp[0][m] = output0_tm[0] + tmp024a + tmp024b + tmp024c * 32;
1964                         //                         tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
1965                         //                         tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
1966 
1967                         float32x4_t _tmp1m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f);
1968                         float32x4_t _tmp3m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f);
1969                         float32x4_t _tmp5m = vaddq_f32(vaddq_f32(_out0tm7, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f));
1970                         vst1q_f32(tmp[1][m], _tmp1m);
1971                         vst1q_f32(tmp[3][m], _tmp3m);
1972                         vst1q_f32(tmp[5][m], _tmp5m);
1973 
1974                         //                         tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
1975                         //                         tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
1976                         //                         tmp[5][m] = output0_tm[7] + tmp135a + tmp135b * 32 + tmp135c;
1977 
1978                         output0_tm_0 += tiles * 32;
1979                         output0_tm_1 += tiles * 32;
1980                         output0_tm_2 += tiles * 32;
1981                         output0_tm_3 += tiles * 32;
1982                         output0_tm_4 += tiles * 32;
1983                         output0_tm_5 += tiles * 32;
1984                         output0_tm_6 += tiles * 32;
1985                         output0_tm_7 += tiles * 32;
1986                     }
1987 
1988                     for (int m = 0; m < 6; m++)
1989                     {
1990                         float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
1991                         float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
1992                         float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
1993                         float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
1994                         float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
1995                         float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
1996                         float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
1997                         float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
1998 
1999                         float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02);
2000                         float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02);
2001 
2002                         //                         float tmp024a = tmp0[1] + tmp0[2];
2003                         //                         float tmp135a = tmp0[1] - tmp0[2];
2004 
2005                         float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04);
2006                         float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04);
2007 
2008                         //                         float tmp024b = tmp0[3] + tmp0[4];
2009                         //                         float tmp135b = tmp0[3] - tmp0[4];
2010 
2011                         float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06);
2012                         float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06);
2013 
2014                         //                         float tmp024c = tmp0[5] + tmp0[6];
2015                         //                         float tmp135c = tmp0[5] - tmp0[6];
2016 
2017                         float32x4_t _out00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp00, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f)));
2018                         float32x4_t _out02 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f));
2019                         float32x4_t _out04 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f));
2020                         vst1q_f32(output0, _out00);
2021                         vst1q_f32(output0 + 8, _out02);
2022                         vst1q_f32(output0 + 16, _out04);
2023 
2024                         //                         output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
2025                         //                         output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
2026                         //                         output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
2027 
2028                         float32x4_t _out01 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f));
2029                         float32x4_t _out03 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f));
2030                         float32x4_t _out05 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp07, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f)));
2031                         vst1q_f32(output0 + 4, _out01);
2032                         vst1q_f32(output0 + 12, _out03);
2033                         vst1q_f32(output0 + 20, _out05);
2034 
2035                         //                         output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
2036                         //                         output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
2037                         //                         output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
2038 
2039                         output0 += outw * 4;
2040                     }
2041                 }
2042             }
2043         }
2044     }
2045     // END transform output
2046 
2047     // cut result pad
2048     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
2049 }
2050 
conv3x3s1_winograd42_transform_kernel_pack4_neon(const Mat & kernel,Mat & kernel_tm_pack4,int inch,int outch)2051 static void conv3x3s1_winograd42_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
2052 {
2053     // winograd43 transform kernel
2054     Mat kernel_tm(6 * 6, inch, outch);
2055 
2056     const float ktm[6][3] = {
2057         {1.0f / 4, 0.0f, 0.0f},
2058         {-1.0f / 6, -1.0f / 6, -1.0f / 6},
2059         {-1.0f / 6, 1.0f / 6, -1.0f / 6},
2060         {1.0f / 24, 1.0f / 12, 1.0f / 6},
2061         {1.0f / 24, -1.0f / 12, 1.0f / 6},
2062         {0.0f, 0.0f, 1.0f}
2063     };
2064 
2065     #pragma omp parallel for
2066     for (int p = 0; p < outch; p++)
2067     {
2068         for (int q = 0; q < inch; q++)
2069         {
2070             const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
2071             float* kernel_tm0 = kernel_tm.channel(p).row(q);
2072 
2073             // transform kernel
2074             const float* k0 = kernel0;
2075             const float* k1 = kernel0 + 3;
2076             const float* k2 = kernel0 + 6;
2077 
2078             // h
2079             float tmp[6][3];
2080             for (int i = 0; i < 6; i++)
2081             {
2082                 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
2083                 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
2084                 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
2085             }
2086 
2087             // U
2088             for (int j = 0; j < 6; j++)
2089             {
2090                 float* tmpp = &tmp[j][0];
2091 
2092                 for (int i = 0; i < 6; i++)
2093                 {
2094                     kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
2095                 }
2096             }
2097         }
2098     }
2099 
2100     // interleave
2101     // src = 36-inch-outch
2102     // dst = 4b-4a-inch/4a-36-outch/4b;
2103 #if __aarch64__
2104     kernel_tm_pack4.create(2 * inch / 4, 36, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
2105 #else
2106     kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 16, 16);
2107 #endif
2108 
2109     int q = 0;
2110 #if __aarch64__
2111     for (; q + 7 < outch; q += 8)
2112     {
2113         const Mat k0 = kernel_tm.channel(q);
2114         const Mat k1 = kernel_tm.channel(q + 1);
2115         const Mat k2 = kernel_tm.channel(q + 2);
2116         const Mat k3 = kernel_tm.channel(q + 3);
2117         const Mat k4 = kernel_tm.channel(q + 4);
2118         const Mat k5 = kernel_tm.channel(q + 5);
2119         const Mat k6 = kernel_tm.channel(q + 6);
2120         const Mat k7 = kernel_tm.channel(q + 7);
2121 
2122         Mat g0 = kernel_tm_pack4.channel(q / 8);
2123 
2124         for (int k = 0; k < 36; k++)
2125         {
2126             float* g00 = g0.row(k);
2127 
2128             for (int p = 0; p + 3 < inch; p += 4)
2129             {
2130                 const float* k00 = k0.row(p);
2131                 const float* k01 = k0.row(p + 1);
2132                 const float* k02 = k0.row(p + 2);
2133                 const float* k03 = k0.row(p + 3);
2134 
2135                 const float* k10 = k1.row(p);
2136                 const float* k11 = k1.row(p + 1);
2137                 const float* k12 = k1.row(p + 2);
2138                 const float* k13 = k1.row(p + 3);
2139 
2140                 const float* k20 = k2.row(p);
2141                 const float* k21 = k2.row(p + 1);
2142                 const float* k22 = k2.row(p + 2);
2143                 const float* k23 = k2.row(p + 3);
2144 
2145                 const float* k30 = k3.row(p);
2146                 const float* k31 = k3.row(p + 1);
2147                 const float* k32 = k3.row(p + 2);
2148                 const float* k33 = k3.row(p + 3);
2149 
2150                 const float* k40 = k4.row(p);
2151                 const float* k41 = k4.row(p + 1);
2152                 const float* k42 = k4.row(p + 2);
2153                 const float* k43 = k4.row(p + 3);
2154 
2155                 const float* k50 = k5.row(p);
2156                 const float* k51 = k5.row(p + 1);
2157                 const float* k52 = k5.row(p + 2);
2158                 const float* k53 = k5.row(p + 3);
2159 
2160                 const float* k60 = k6.row(p);
2161                 const float* k61 = k6.row(p + 1);
2162                 const float* k62 = k6.row(p + 2);
2163                 const float* k63 = k6.row(p + 3);
2164 
2165                 const float* k70 = k7.row(p);
2166                 const float* k71 = k7.row(p + 1);
2167                 const float* k72 = k7.row(p + 2);
2168                 const float* k73 = k7.row(p + 3);
2169 
2170                 g00[0] = k00[k];
2171                 g00[1] = k10[k];
2172                 g00[2] = k20[k];
2173                 g00[3] = k30[k];
2174 
2175                 g00[4] = k40[k];
2176                 g00[5] = k50[k];
2177                 g00[6] = k60[k];
2178                 g00[7] = k70[k];
2179 
2180                 g00[8] = k01[k];
2181                 g00[9] = k11[k];
2182                 g00[10] = k21[k];
2183                 g00[11] = k31[k];
2184 
2185                 g00[12] = k41[k];
2186                 g00[13] = k51[k];
2187                 g00[14] = k61[k];
2188                 g00[15] = k71[k];
2189 
2190                 g00[16] = k02[k];
2191                 g00[17] = k12[k];
2192                 g00[18] = k22[k];
2193                 g00[19] = k32[k];
2194 
2195                 g00[20] = k42[k];
2196                 g00[21] = k52[k];
2197                 g00[22] = k62[k];
2198                 g00[23] = k72[k];
2199 
2200                 g00[24] = k03[k];
2201                 g00[25] = k13[k];
2202                 g00[26] = k23[k];
2203                 g00[27] = k33[k];
2204 
2205                 g00[28] = k43[k];
2206                 g00[29] = k53[k];
2207                 g00[30] = k63[k];
2208                 g00[31] = k73[k];
2209 
2210                 g00 += 32;
2211             }
2212         }
2213     }
2214 #endif // __aarch64__
2215     for (; q + 3 < outch; q += 4)
2216     {
2217         const Mat k0 = kernel_tm.channel(q);
2218         const Mat k1 = kernel_tm.channel(q + 1);
2219         const Mat k2 = kernel_tm.channel(q + 2);
2220         const Mat k3 = kernel_tm.channel(q + 3);
2221 
2222 #if __aarch64__
2223         Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
2224 #else
2225         Mat g0 = kernel_tm_pack4.channel(q / 4);
2226 #endif
2227 
2228         for (int k = 0; k < 36; k++)
2229         {
2230             float* g00 = g0.row(k);
2231 
2232             for (int p = 0; p + 3 < inch; p += 4)
2233             {
2234                 const float* k00 = k0.row(p);
2235                 const float* k01 = k0.row(p + 1);
2236                 const float* k02 = k0.row(p + 2);
2237                 const float* k03 = k0.row(p + 3);
2238 
2239                 const float* k10 = k1.row(p);
2240                 const float* k11 = k1.row(p + 1);
2241                 const float* k12 = k1.row(p + 2);
2242                 const float* k13 = k1.row(p + 3);
2243 
2244                 const float* k20 = k2.row(p);
2245                 const float* k21 = k2.row(p + 1);
2246                 const float* k22 = k2.row(p + 2);
2247                 const float* k23 = k2.row(p + 3);
2248 
2249                 const float* k30 = k3.row(p);
2250                 const float* k31 = k3.row(p + 1);
2251                 const float* k32 = k3.row(p + 2);
2252                 const float* k33 = k3.row(p + 3);
2253 
2254                 g00[0] = k00[k];
2255                 g00[1] = k10[k];
2256                 g00[2] = k20[k];
2257                 g00[3] = k30[k];
2258 
2259                 g00[4] = k01[k];
2260                 g00[5] = k11[k];
2261                 g00[6] = k21[k];
2262                 g00[7] = k31[k];
2263 
2264                 g00[8] = k02[k];
2265                 g00[9] = k12[k];
2266                 g00[10] = k22[k];
2267                 g00[11] = k32[k];
2268 
2269                 g00[12] = k03[k];
2270                 g00[13] = k13[k];
2271                 g00[14] = k23[k];
2272                 g00[15] = k33[k];
2273 
2274                 g00 += 16;
2275             }
2276         }
2277     }
2278 }
2279 
conv3x3s1_winograd42_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Mat & _bias,const Option & opt)2280 static void conv3x3s1_winograd42_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
2281 {
2282     int w = bottom_blob.w;
2283     int h = bottom_blob.h;
2284     int inch = bottom_blob.c;
2285     size_t elemsize = bottom_blob.elemsize;
2286     int elempack = bottom_blob.elempack;
2287 
2288     int outw = top_blob.w;
2289     int outh = top_blob.h;
2290     int outch = top_blob.c;
2291 
2292     // pad to 4n+2
2293     Mat bottom_blob_bordered = bottom_blob;
2294 
2295     outw = (outw + 3) / 4 * 4;
2296     outh = (outh + 3) / 4 * 4;
2297 
2298     w = outw + 2;
2299     h = outh + 2;
2300     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
2301 
2302     const float* bias = _bias;
2303 
2304     // BEGIN transform input
2305     Mat bottom_blob_tm;
2306     {
2307         int w_tm = outw / 4 * 6;
2308         int h_tm = outh / 4 * 6;
2309 
2310         const int tiles = w_tm / 6 * h_tm / 6;
2311 
2312         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
2313 
2314         // const float itm[4][4] = {
2315         //     {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
2316         //     {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
2317         //     {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
2318         //     {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
2319         //     {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
2320         //     {0.0f, 4.0f,  0.0f,-5.0f, 0.0f, 1.0f}
2321         // };
2322 
2323         // 0 =  4 * r00 - 5 * r02 + r04
2324         // 1 = -4 * (r01 + r02) + r04 + r03
2325         // 2 =  4 * (r01 - r02) + r04 - r03
2326         // 3 = -2 * (r01 - r03) + r04 - r02
2327         // 4 =  2 * (r01 - r03) + r04 - r02
2328         // 5 =  4 * r01 - 5 * r03 + r05
2329 
2330         #pragma omp parallel for num_threads(opt.num_threads)
2331         for (int q = 0; q < inch; q++)
2332         {
2333             const Mat img0 = bottom_blob_bordered.channel(q);
2334             Mat img0_tm = bottom_blob_tm.channel(q);
2335 
2336             float tmp[6][6][4];
2337 
2338             // tile
2339             for (int i = 0; i < h_tm / 6; i++)
2340             {
2341                 for (int j = 0; j < w_tm / 6; j++)
2342                 {
2343                     const float* r0 = img0.row(i * 4) + (j * 4) * 4;
2344 
2345                     for (int m = 0; m < 6; m++)
2346                     {
2347                         float32x4_t _r00 = vld1q_f32(r0);
2348                         float32x4_t _r01 = vld1q_f32(r0 + 4);
2349                         float32x4_t _r02 = vld1q_f32(r0 + 8);
2350                         float32x4_t _r03 = vld1q_f32(r0 + 12);
2351                         float32x4_t _r04 = vld1q_f32(r0 + 16);
2352                         float32x4_t _r05 = vld1q_f32(r0 + 20);
2353 
2354                         float32x4_t _tmp0m = vmlsq_n_f32(vmlaq_n_f32(_r04, _r00, 4.f), _r02, 5.f);
2355                         float32x4_t _tmp1m = vmlsq_n_f32(vaddq_f32(_r04, _r03), vaddq_f32(_r01, _r02), 4.f);
2356                         float32x4_t _tmp2m = vmlaq_n_f32(vsubq_f32(_r04, _r03), vsubq_f32(_r01, _r02), 4.f);
2357                         float32x4_t _tmp3m = vmlsq_n_f32(vsubq_f32(_r04, _r02), vsubq_f32(_r01, _r03), 2.f);
2358                         float32x4_t _tmp4m = vmlaq_n_f32(vsubq_f32(_r04, _r02), vsubq_f32(_r01, _r03), 2.f);
2359                         float32x4_t _tmp5m = vmlsq_n_f32(vmlaq_n_f32(_r05, _r01, 4.f), _r03, 5.f);
2360 
2361                         vst1q_f32(tmp[0][m], _tmp0m);
2362                         vst1q_f32(tmp[1][m], _tmp1m);
2363                         vst1q_f32(tmp[2][m], _tmp2m);
2364                         vst1q_f32(tmp[3][m], _tmp3m);
2365                         vst1q_f32(tmp[4][m], _tmp4m);
2366                         vst1q_f32(tmp[5][m], _tmp5m);
2367 
2368                         r0 += w * 4;
2369                     }
2370 
2371                     float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 6 + j) * 4;
2372                     float* r0_tm_1 = r0_tm_0 + tiles * 4;
2373                     float* r0_tm_2 = r0_tm_0 + tiles * 8;
2374                     float* r0_tm_3 = r0_tm_0 + tiles * 12;
2375                     float* r0_tm_4 = r0_tm_0 + tiles * 16;
2376                     float* r0_tm_5 = r0_tm_0 + tiles * 20;
2377 
2378                     for (int m = 0; m < 6; m++)
2379                     {
2380                         float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
2381                         float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
2382                         float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
2383                         float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
2384                         float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
2385                         float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
2386 
2387                         float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f);
2388                         float32x4_t _r0tm1 = vmlsq_n_f32(vaddq_f32(_tmp04, _tmp03), vaddq_f32(_tmp01, _tmp02), 4.f);
2389                         float32x4_t _r0tm2 = vmlaq_n_f32(vsubq_f32(_tmp04, _tmp03), vsubq_f32(_tmp01, _tmp02), 4.f);
2390                         float32x4_t _r0tm3 = vmlsq_n_f32(vsubq_f32(_tmp04, _tmp02), vsubq_f32(_tmp01, _tmp03), 2.f);
2391                         float32x4_t _r0tm4 = vmlaq_n_f32(vsubq_f32(_tmp04, _tmp02), vsubq_f32(_tmp01, _tmp03), 2.f);
2392                         float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f);
2393 
2394                         vst1q_f32(r0_tm_0, _r0tm0);
2395                         vst1q_f32(r0_tm_1, _r0tm1);
2396                         vst1q_f32(r0_tm_2, _r0tm2);
2397                         vst1q_f32(r0_tm_3, _r0tm3);
2398                         vst1q_f32(r0_tm_4, _r0tm4);
2399                         vst1q_f32(r0_tm_5, _r0tm5);
2400 
2401                         r0_tm_0 += tiles * 24;
2402                         r0_tm_1 += tiles * 24;
2403                         r0_tm_2 += tiles * 24;
2404                         r0_tm_3 += tiles * 24;
2405                         r0_tm_4 += tiles * 24;
2406                         r0_tm_5 += tiles * 24;
2407                     }
2408                 }
2409             }
2410         }
2411     }
2412     bottom_blob_bordered = Mat();
2413     // END transform input
2414 
2415     // BEGIN dot
2416     Mat top_blob_tm;
2417     {
2418         int w_tm = outw / 4 * 6;
2419         int h_tm = outh / 4 * 6;
2420 
2421         const int tiles = h_tm / 6 * w_tm / 6;
2422 
2423         // permute
2424         //         bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
2425         Mat bottom_blob_tm2;
2426 #if __aarch64__
2427         if (tiles >= 12)
2428             bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, 36, elemsize, elempack, opt.workspace_allocator);
2429         else if (tiles >= 8)
2430             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2431         else if (tiles >= 4)
2432             bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2433         else if (tiles >= 2)
2434             bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2435         else // if (tiles >= 1)
2436             bottom_blob_tm2.create(1 * inch, tiles, 36, elemsize, elempack, opt.workspace_allocator);
2437 #else
2438         if (tiles >= 8)
2439             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2440         else if (tiles >= 4)
2441             bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2442         else if (tiles >= 2)
2443             bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2444         else // if (tiles >= 1)
2445             bottom_blob_tm2.create(1 * inch, tiles, 36, elemsize, elempack, opt.workspace_allocator);
2446 #endif
2447 
2448         #pragma omp parallel for num_threads(opt.num_threads)
2449         for (int r = 0; r < 36; r++)
2450         {
2451             Mat tm2 = bottom_blob_tm2.channel(r);
2452 
2453             // tile
2454             int i = 0;
2455 #if __aarch64__
2456             for (; i + 11 < tiles; i += 12)
2457             {
2458                 float* tm2p = tm2.row(i / 12);
2459 
2460                 const float* r0 = bottom_blob_tm;
2461 
2462                 r0 += (r * tiles + i) * 4;
2463 
2464                 for (int q = 0; q < inch; q++)
2465                 {
2466                     asm volatile(
2467                         "prfm   pldl1keep, [%0, #512]       \n"
2468                         "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
2469                         "prfm   pldl1keep, [%0, #512]       \n"
2470                         "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
2471                         "prfm   pldl1keep, [%0, #512]       \n"
2472                         "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
2473                         "st1    {v0.4s}, [%1], #16          \n"
2474                         "st1    {v4.4s}, [%1], #16          \n"
2475                         "st1    {v8.4s}, [%1], #16          \n"
2476                         "sub    %0, %0, #128                \n"
2477                         "st1    {v1.4s}, [%1], #16          \n"
2478                         "st1    {v5.4s}, [%1], #16          \n"
2479                         "st1    {v9.4s}, [%1], #16          \n"
2480                         "st1    {v2.4s}, [%1], #16          \n"
2481                         "st1    {v6.4s}, [%1], #16          \n"
2482                         "st1    {v10.4s}, [%1], #16         \n"
2483                         "st1    {v3.4s}, [%1], #16          \n"
2484                         "st1    {v7.4s}, [%1], #16          \n"
2485                         "st1    {v11.4s}, [%1], #16         \n"
2486                         : "=r"(r0),  // %0
2487                         "=r"(tm2p) // %1
2488                         : "0"(r0),
2489                         "1"(tm2p)
2490                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
2491                     r0 += bottom_blob_tm.cstep * 4;
2492                 }
2493             }
2494 #endif
2495             for (; i + 7 < tiles; i += 8)
2496             {
2497 #if __aarch64__
2498                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
2499 #else
2500                 float* tm2p = tm2.row(i / 8);
2501 #endif
2502 
2503                 const float* r0 = bottom_blob_tm;
2504 
2505                 r0 += (r * tiles + i) * 4;
2506 
2507                 for (int q = 0; q < inch; q++)
2508                 {
2509 #if __aarch64__
2510                     asm volatile(
2511                         "prfm   pldl1keep, [%0, #512]       \n"
2512                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
2513                         "prfm   pldl1keep, [%0, #512]       \n"
2514                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
2515                         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
2516                         "sub    %0, %0, #64                 \n"
2517                         "st1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
2518                         : "=r"(r0),  // %0
2519                         "=r"(tm2p) // %1
2520                         : "0"(r0),
2521                         "1"(tm2p)
2522                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2523 #else
2524                     asm volatile(
2525                         "pld        [%0, #512]          \n"
2526                         "vldm       %0!, {d0-d7}        \n"
2527                         "pld        [%0, #512]          \n"
2528                         "vldm       %0, {d16-d23}       \n"
2529 
2530                         // transpose 8x4
2531                         "vtrn.32    q0, q1              \n"
2532                         "vtrn.32    q2, q3              \n"
2533                         "vtrn.32    q8, q9              \n"
2534                         "vtrn.32    q10, q11            \n"
2535                         "vswp       d1, d4              \n"
2536                         "vswp       d3, d6              \n"
2537                         "vswp       d17, d20            \n"
2538                         "vswp       d19, d22            \n"
2539                         "vswp       q1, q8              \n"
2540                         "vswp       q3, q10             \n"
2541 
2542                         "vst1.f32   {d0-d3}, [%1 :128]! \n"
2543                         "vst1.f32   {d16-d19}, [%1 :128]! \n"
2544                         "sub        %0, %0, #64         \n"
2545                         "vst1.f32   {d4-d7}, [%1 :128]! \n"
2546                         "vst1.f32   {d20-d23}, [%1 :128]! \n"
2547                         : "=r"(r0),  // %0
2548                         "=r"(tm2p) // %1
2549                         : "0"(r0),
2550                         "1"(tm2p)
2551                         : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
2552 #endif
2553                     r0 += bottom_blob_tm.cstep * 4;
2554                 }
2555             }
2556             for (; i + 3 < tiles; i += 4)
2557             {
2558 #if __aarch64__
2559                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
2560 #else
2561                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
2562 #endif
2563 
2564                 const float* r0 = bottom_blob_tm;
2565 
2566                 r0 += (r * tiles + i) * 4;
2567 
2568                 for (int q = 0; q < inch; q++)
2569                 {
2570 #if __aarch64__
2571                     asm volatile(
2572                         "prfm   pldl1keep, [%0, #512]       \n"
2573                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
2574                         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
2575                         : "=r"(r0),  // %0
2576                         "=r"(tm2p) // %1
2577                         : "0"(r0),
2578                         "1"(tm2p)
2579                         : "memory", "v0", "v1", "v2", "v3");
2580 #else
2581                     asm volatile(
2582                         "pld        [%0, #512]          \n"
2583                         "vldm       %0, {d0-d7}         \n"
2584                         "vstm       %1!, {d0-d7}        \n"
2585                         : "=r"(r0),  // %0
2586                         "=r"(tm2p) // %1
2587                         : "0"(r0),
2588                         "1"(tm2p)
2589                         : "memory", "q0", "q1", "q2", "q3");
2590 #endif // __aarch64__
2591                     r0 += bottom_blob_tm.cstep * 4;
2592                 }
2593             }
2594             for (; i + 1 < tiles; i += 2)
2595             {
2596 #if __aarch64__
2597                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
2598 #else
2599                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
2600 #endif
2601 
2602                 const float* r0 = bottom_blob_tm;
2603 
2604                 r0 += (r * tiles + i) * 4;
2605 
2606                 for (int q = 0; q < inch; q++)
2607                 {
2608 #if __aarch64__
2609                     asm volatile(
2610                         "prfm   pldl1keep, [%0, #256]       \n"
2611                         "ld1    {v0.4s, v1.4s}, [%0]        \n"
2612                         "st1    {v0.4s, v1.4s}, [%1], #32   \n"
2613                         : "=r"(r0),  // %0
2614                         "=r"(tm2p) // %1
2615                         : "0"(r0),
2616                         "1"(tm2p)
2617                         : "memory", "v0", "v1");
2618 #else
2619                     asm volatile(
2620                         "pld        [%0, #256]          \n"
2621                         "vld1.f32   {d0-d3}, [%0 :128]  \n"
2622                         "vst1.f32   {d0-d3}, [%1 :128]! \n"
2623                         : "=r"(r0),  // %0
2624                         "=r"(tm2p) // %1
2625                         : "0"(r0),
2626                         "1"(tm2p)
2627                         : "memory", "q0", "q1");
2628 #endif // __aarch64__
2629                     r0 += bottom_blob_tm.cstep * 4;
2630                 }
2631             }
2632             for (; i < tiles; i++)
2633             {
2634 #if __aarch64__
2635                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
2636 #else
2637                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
2638 #endif
2639 
2640                 const float* r0 = bottom_blob_tm;
2641 
2642                 r0 += (r * tiles + i) * 4;
2643 
2644                 for (int q = 0; q < inch; q++)
2645                 {
2646 #if __aarch64__
2647                     asm volatile(
2648                         "prfm   pldl1keep, [%0, #128]       \n"
2649                         "ld1    {v0.4s}, [%0]               \n"
2650                         "st1    {v0.4s}, [%1], #16          \n"
2651                         : "=r"(r0),  // %0
2652                         "=r"(tm2p) // %1
2653                         : "0"(r0),
2654                         "1"(tm2p)
2655                         : "memory", "v0");
2656 #else
2657                     asm volatile(
2658                         "pld        [%0, #128]          \n"
2659                         "vld1.f32   {d0-d1}, [%0 :128]  \n"
2660                         "vst1.f32   {d0-d1}, [%1 :128]! \n"
2661                         : "=r"(r0),  // %0
2662                         "=r"(tm2p) // %1
2663                         : "0"(r0),
2664                         "1"(tm2p)
2665                         : "memory", "q0");
2666 #endif // __aarch64__
2667                     r0 += bottom_blob_tm.cstep * 4;
2668                 }
2669             }
2670         }
2671 
2672         bottom_blob_tm = Mat();
2673         // permute end
2674 
2675         top_blob_tm.create(tiles, 36, outch, elemsize, elempack, opt.workspace_allocator);
2676 
2677         int remain_outch_start = 0;
2678 
2679 #if __ARM_NEON && __aarch64__
2680         int nn_outch = 0;
2681         nn_outch = outch >> 1;
2682         remain_outch_start = nn_outch << 1;
2683 
2684         #pragma omp parallel for num_threads(opt.num_threads)
2685         for (int pp = 0; pp < nn_outch; pp++)
2686         {
2687             int p = pp * 2;
2688 
2689             float* output0_tm = top_blob_tm.channel(p);
2690             float* output1_tm = top_blob_tm.channel(p + 1);
2691 
2692             const Mat kernel01_tm = kernel_tm.channel(pp);
2693 
2694             for (int r = 0; r < 36; r++)
2695             {
2696                 const Mat bb2 = bottom_blob_tm2.channel(r);
2697 
2698                 int i = 0;
2699                 for (; i + 11 < tiles; i += 12)
2700                 {
2701                     const float* r0 = bb2.row(i / 12);
2702 
2703                     const float* k01 = kernel01_tm.row(r);
2704 
2705                     int nn = inch; // inch always > 0
2706 
2707                     asm volatile(
2708                         "eor    v8.16b, v8.16b, v8.16b      \n"
2709                         "eor    v9.16b, v9.16b, v9.16b      \n"
2710                         "eor    v10.16b, v10.16b, v10.16b   \n"
2711                         "eor    v11.16b, v11.16b, v11.16b   \n"
2712                         "eor    v12.16b, v12.16b, v12.16b   \n"
2713                         "eor    v13.16b, v13.16b, v13.16b   \n"
2714                         "eor    v14.16b, v14.16b, v14.16b   \n"
2715                         "eor    v15.16b, v15.16b, v15.16b   \n"
2716                         "eor    v16.16b, v16.16b, v16.16b   \n"
2717                         "eor    v17.16b, v17.16b, v17.16b   \n"
2718                         "eor    v18.16b, v18.16b, v18.16b   \n"
2719                         "eor    v19.16b, v19.16b, v19.16b   \n"
2720                         "eor    v20.16b, v20.16b, v20.16b   \n"
2721                         "eor    v21.16b, v21.16b, v21.16b   \n"
2722                         "eor    v22.16b, v22.16b, v22.16b   \n"
2723                         "eor    v23.16b, v23.16b, v23.16b   \n"
2724                         "eor    v24.16b, v24.16b, v24.16b   \n"
2725                         "eor    v25.16b, v25.16b, v25.16b   \n"
2726                         "eor    v26.16b, v26.16b, v26.16b   \n"
2727                         "eor    v27.16b, v27.16b, v27.16b   \n"
2728                         "eor    v28.16b, v28.16b, v28.16b   \n"
2729                         "eor    v29.16b, v29.16b, v29.16b   \n"
2730                         "eor    v30.16b, v30.16b, v30.16b   \n"
2731                         "eor    v31.16b, v31.16b, v31.16b   \n"
2732 
2733                         "0:                                 \n"
2734 
2735                         "prfm   pldl1keep, [%3, #512]       \n"
2736                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2737 
2738                         "prfm   pldl1keep, [%4, #512]       \n"
2739                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64   \n" // w0011_01
2740 
2741                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
2742                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
2743                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
2744                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
2745                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
2746                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
2747                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
2748                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
2749                         "fmla   v16.4s, v4.4s, v2.s[0]      \n"
2750                         "fmla   v17.4s, v4.4s, v2.s[1]      \n"
2751                         "fmla   v18.4s, v4.4s, v2.s[2]      \n"
2752                         "fmla   v19.4s, v4.4s, v2.s[3]      \n"
2753 
2754                         "fmla   v20.4s, v5.4s, v0.s[0]      \n"
2755                         "fmla   v21.4s, v5.4s, v0.s[1]      \n"
2756                         "fmla   v22.4s, v5.4s, v0.s[2]      \n"
2757                         "fmla   v23.4s, v5.4s, v0.s[3]      \n"
2758                         "fmla   v24.4s, v5.4s, v1.s[0]      \n"
2759                         "fmla   v25.4s, v5.4s, v1.s[1]      \n"
2760                         "fmla   v26.4s, v5.4s, v1.s[2]      \n"
2761                         "fmla   v27.4s, v5.4s, v1.s[3]      \n"
2762                         "fmla   v28.4s, v5.4s, v2.s[0]      \n"
2763                         "fmla   v29.4s, v5.4s, v2.s[1]      \n"
2764                         "fmla   v30.4s, v5.4s, v2.s[2]      \n"
2765                         "fmla   v31.4s, v5.4s, v2.s[3]      \n"
2766 
2767                         "fmla   v8.4s, v6.4s, v3.s[0]       \n"
2768                         "fmla   v9.4s, v6.4s, v3.s[1]       \n"
2769                         "fmla   v10.4s, v6.4s, v3.s[2]      \n"
2770                         "fmla   v11.4s, v6.4s, v3.s[3]      \n"
2771 
2772                         "fmla   v20.4s, v7.4s, v3.s[0]      \n"
2773                         "fmla   v21.4s, v7.4s, v3.s[1]      \n"
2774                         "fmla   v22.4s, v7.4s, v3.s[2]      \n"
2775                         "fmla   v23.4s, v7.4s, v3.s[3]      \n"
2776 
2777                         "prfm   pldl1keep, [%3, #512]       \n"
2778                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2779 
2780                         "fmla   v12.4s, v6.4s, v0.s[0]      \n"
2781                         "fmla   v13.4s, v6.4s, v0.s[1]      \n"
2782                         "fmla   v14.4s, v6.4s, v0.s[2]      \n"
2783                         "fmla   v15.4s, v6.4s, v0.s[3]      \n"
2784                         "fmla   v16.4s, v6.4s, v1.s[0]      \n"
2785                         "fmla   v17.4s, v6.4s, v1.s[1]      \n"
2786                         "fmla   v18.4s, v6.4s, v1.s[2]      \n"
2787                         "fmla   v19.4s, v6.4s, v1.s[3]      \n"
2788 
2789                         "fmla   v24.4s, v7.4s, v0.s[0]      \n"
2790                         "fmla   v25.4s, v7.4s, v0.s[1]      \n"
2791                         "fmla   v26.4s, v7.4s, v0.s[2]      \n"
2792                         "fmla   v27.4s, v7.4s, v0.s[3]      \n"
2793                         "fmla   v28.4s, v7.4s, v1.s[0]      \n"
2794                         "fmla   v29.4s, v7.4s, v1.s[1]      \n"
2795                         "fmla   v30.4s, v7.4s, v1.s[2]      \n"
2796                         "fmla   v31.4s, v7.4s, v1.s[3]      \n"
2797 
2798                         "prfm   pldl1keep, [%4, #512]       \n"
2799                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64   \n" // w2233_01
2800 
2801                         "fmla   v8.4s, v4.4s, v2.s[0]       \n"
2802                         "fmla   v9.4s, v4.4s, v2.s[1]       \n"
2803                         "fmla   v10.4s, v4.4s, v2.s[2]      \n"
2804                         "fmla   v11.4s, v4.4s, v2.s[3]      \n"
2805                         "fmla   v12.4s, v4.4s, v3.s[0]      \n"
2806                         "fmla   v13.4s, v4.4s, v3.s[1]      \n"
2807                         "fmla   v14.4s, v4.4s, v3.s[2]      \n"
2808                         "fmla   v15.4s, v4.4s, v3.s[3]      \n"
2809 
2810                         "fmla   v20.4s, v5.4s, v2.s[0]      \n"
2811                         "fmla   v21.4s, v5.4s, v2.s[1]      \n"
2812                         "fmla   v22.4s, v5.4s, v2.s[2]      \n"
2813                         "fmla   v23.4s, v5.4s, v2.s[3]      \n"
2814                         "fmla   v24.4s, v5.4s, v3.s[0]      \n"
2815                         "fmla   v25.4s, v5.4s, v3.s[1]      \n"
2816                         "fmla   v26.4s, v5.4s, v3.s[2]      \n"
2817                         "fmla   v27.4s, v5.4s, v3.s[3]      \n"
2818 
2819                         "prfm   pldl1keep, [%3, #512]       \n"
2820                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2821 
2822                         "fmla   v16.4s, v4.4s, v0.s[0]      \n"
2823                         "fmla   v17.4s, v4.4s, v0.s[1]      \n"
2824                         "fmla   v18.4s, v4.4s, v0.s[2]      \n"
2825                         "fmla   v19.4s, v4.4s, v0.s[3]      \n"
2826 
2827                         "fmla   v28.4s, v5.4s, v0.s[0]      \n"
2828                         "fmla   v29.4s, v5.4s, v0.s[1]      \n"
2829                         "fmla   v30.4s, v5.4s, v0.s[2]      \n"
2830                         "fmla   v31.4s, v5.4s, v0.s[3]      \n"
2831 
2832                         "fmla   v8.4s, v6.4s, v1.s[0]       \n"
2833                         "fmla   v9.4s, v6.4s, v1.s[1]       \n"
2834                         "fmla   v10.4s, v6.4s, v1.s[2]      \n"
2835                         "fmla   v11.4s, v6.4s, v1.s[3]      \n"
2836                         "fmla   v12.4s, v6.4s, v2.s[0]      \n"
2837                         "fmla   v13.4s, v6.4s, v2.s[1]      \n"
2838                         "fmla   v14.4s, v6.4s, v2.s[2]      \n"
2839                         "fmla   v15.4s, v6.4s, v2.s[3]      \n"
2840                         "fmla   v16.4s, v6.4s, v3.s[0]      \n"
2841                         "fmla   v17.4s, v6.4s, v3.s[1]      \n"
2842                         "fmla   v18.4s, v6.4s, v3.s[2]      \n"
2843                         "fmla   v19.4s, v6.4s, v3.s[3]      \n"
2844 
2845                         "subs   %w0, %w0, #1                \n"
2846 
2847                         "fmla   v20.4s, v7.4s, v1.s[0]      \n"
2848                         "fmla   v21.4s, v7.4s, v1.s[1]      \n"
2849                         "fmla   v22.4s, v7.4s, v1.s[2]      \n"
2850                         "fmla   v23.4s, v7.4s, v1.s[3]      \n"
2851                         "fmla   v24.4s, v7.4s, v2.s[0]      \n"
2852                         "fmla   v25.4s, v7.4s, v2.s[1]      \n"
2853                         "fmla   v26.4s, v7.4s, v2.s[2]      \n"
2854                         "fmla   v27.4s, v7.4s, v2.s[3]      \n"
2855                         "fmla   v28.4s, v7.4s, v3.s[0]      \n"
2856                         "fmla   v29.4s, v7.4s, v3.s[1]      \n"
2857                         "fmla   v30.4s, v7.4s, v3.s[2]      \n"
2858                         "fmla   v31.4s, v7.4s, v3.s[3]      \n"
2859 
2860                         "bne    0b                          \n"
2861 
2862                         "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
2863                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
2864                         "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
2865                         "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
2866                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
2867                         "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
2868 
2869                         : "=r"(nn),         // %0
2870                         "=r"(output0_tm), // %1
2871                         "=r"(output1_tm), // %2
2872                         "=r"(r0),         // %3
2873                         "=r"(k01)         // %4
2874                         : "0"(nn),
2875                         "1"(output0_tm),
2876                         "2"(output1_tm),
2877                         "3"(r0),
2878                         "4"(k01)
2879                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
2880                 }
2881                 for (; i + 7 < tiles; i += 8)
2882                 {
2883                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
2884 
2885                     const float* k01 = kernel01_tm.row(r);
2886 
2887                     int nn = inch; // inch always > 0
2888 
2889                     asm volatile(
2890                         "eor    v16.16b, v16.16b, v16.16b   \n"
2891                         "eor    v17.16b, v17.16b, v17.16b   \n"
2892                         "eor    v18.16b, v18.16b, v18.16b   \n"
2893                         "eor    v19.16b, v19.16b, v19.16b   \n"
2894                         "eor    v20.16b, v20.16b, v20.16b   \n"
2895                         "eor    v21.16b, v21.16b, v21.16b   \n"
2896                         "eor    v22.16b, v22.16b, v22.16b   \n"
2897                         "eor    v23.16b, v23.16b, v23.16b   \n"
2898                         "eor    v24.16b, v24.16b, v24.16b   \n"
2899                         "eor    v25.16b, v25.16b, v25.16b   \n"
2900                         "eor    v26.16b, v26.16b, v26.16b   \n"
2901                         "eor    v27.16b, v27.16b, v27.16b   \n"
2902                         "eor    v28.16b, v28.16b, v28.16b   \n"
2903                         "eor    v29.16b, v29.16b, v29.16b   \n"
2904                         "eor    v30.16b, v30.16b, v30.16b   \n"
2905                         "eor    v31.16b, v31.16b, v31.16b   \n"
2906 
2907                         "0:                                 \n"
2908 
2909                         "prfm   pldl1keep, [%3, #512]       \n"
2910                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
2911 
2912                         "prfm   pldl1keep, [%4, #512]       \n"
2913                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
2914 
2915                         "prfm   pldl1keep, [%3, #512]       \n"
2916                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7
2917 
2918                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
2919                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
2920                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
2921                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
2922                         "fmla   v20.4s, v8.4s, v4.s[0]      \n"
2923                         "fmla   v21.4s, v8.4s, v5.s[0]      \n"
2924                         "fmla   v22.4s, v8.4s, v6.s[0]      \n"
2925                         "fmla   v23.4s, v8.4s, v7.s[0]      \n"
2926 
2927                         "fmla   v24.4s, v9.4s, v0.s[0]      \n"
2928                         "fmla   v25.4s, v9.4s, v1.s[0]      \n"
2929                         "fmla   v26.4s, v9.4s, v2.s[0]      \n"
2930                         "fmla   v27.4s, v9.4s, v3.s[0]      \n"
2931                         "fmla   v28.4s, v9.4s, v4.s[0]      \n"
2932                         "fmla   v29.4s, v9.4s, v5.s[0]      \n"
2933                         "fmla   v30.4s, v9.4s, v6.s[0]      \n"
2934                         "fmla   v31.4s, v9.4s, v7.s[0]      \n"
2935 
2936                         "prfm   pldl1keep, [%4, #512]       \n"
2937                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
2938 
2939                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
2940                         "fmla   v17.4s, v10.4s, v1.s[1]     \n"
2941                         "fmla   v18.4s, v10.4s, v2.s[1]     \n"
2942                         "fmla   v19.4s, v10.4s, v3.s[1]     \n"
2943                         "fmla   v20.4s, v10.4s, v4.s[1]     \n"
2944                         "fmla   v21.4s, v10.4s, v5.s[1]     \n"
2945                         "fmla   v22.4s, v10.4s, v6.s[1]     \n"
2946                         "fmla   v23.4s, v10.4s, v7.s[1]     \n"
2947 
2948                         "fmla   v24.4s, v11.4s, v0.s[1]     \n"
2949                         "fmla   v25.4s, v11.4s, v1.s[1]     \n"
2950                         "fmla   v26.4s, v11.4s, v2.s[1]     \n"
2951                         "fmla   v27.4s, v11.4s, v3.s[1]     \n"
2952                         "fmla   v28.4s, v11.4s, v4.s[1]     \n"
2953                         "fmla   v29.4s, v11.4s, v5.s[1]     \n"
2954                         "fmla   v30.4s, v11.4s, v6.s[1]     \n"
2955                         "fmla   v31.4s, v11.4s, v7.s[1]     \n"
2956 
2957                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
2958                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
2959                         "fmla   v18.4s, v12.4s, v2.s[2]     \n"
2960                         "fmla   v19.4s, v12.4s, v3.s[2]     \n"
2961                         "fmla   v20.4s, v12.4s, v4.s[2]     \n"
2962                         "fmla   v21.4s, v12.4s, v5.s[2]     \n"
2963                         "fmla   v22.4s, v12.4s, v6.s[2]     \n"
2964                         "fmla   v23.4s, v12.4s, v7.s[2]     \n"
2965 
2966                         "fmla   v24.4s, v13.4s, v0.s[2]     \n"
2967                         "fmla   v25.4s, v13.4s, v1.s[2]     \n"
2968                         "fmla   v26.4s, v13.4s, v2.s[2]     \n"
2969                         "fmla   v27.4s, v13.4s, v3.s[2]     \n"
2970                         "fmla   v28.4s, v13.4s, v4.s[2]     \n"
2971                         "fmla   v29.4s, v13.4s, v5.s[2]     \n"
2972                         "fmla   v30.4s, v13.4s, v6.s[2]     \n"
2973                         "fmla   v31.4s, v13.4s, v7.s[2]     \n"
2974 
2975                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
2976                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
2977                         "fmla   v18.4s, v14.4s, v2.s[3]     \n"
2978                         "fmla   v19.4s, v14.4s, v3.s[3]     \n"
2979                         "fmla   v20.4s, v14.4s, v4.s[3]     \n"
2980                         "fmla   v21.4s, v14.4s, v5.s[3]     \n"
2981                         "fmla   v22.4s, v14.4s, v6.s[3]     \n"
2982                         "fmla   v23.4s, v14.4s, v7.s[3]     \n"
2983 
2984                         "subs   %w0, %w0, #1                \n"
2985 
2986                         "fmla   v24.4s, v15.4s, v0.s[3]     \n"
2987                         "fmla   v25.4s, v15.4s, v1.s[3]     \n"
2988                         "fmla   v26.4s, v15.4s, v2.s[3]     \n"
2989                         "fmla   v27.4s, v15.4s, v3.s[3]     \n"
2990                         "fmla   v28.4s, v15.4s, v4.s[3]     \n"
2991                         "fmla   v29.4s, v15.4s, v5.s[3]     \n"
2992                         "fmla   v30.4s, v15.4s, v6.s[3]     \n"
2993                         "fmla   v31.4s, v15.4s, v7.s[3]     \n"
2994 
2995                         "bne    0b                          \n"
2996 
2997                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
2998                         "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
2999                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
3000                         "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
3001 
3002                         : "=r"(nn),         // %0
3003                         "=r"(output0_tm), // %1
3004                         "=r"(output1_tm), // %2
3005                         "=r"(r0),         // %3
3006                         "=r"(k01)         // %4
3007                         : "0"(nn),
3008                         "1"(output0_tm),
3009                         "2"(output1_tm),
3010                         "3"(r0),
3011                         "4"(k01)
3012                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
3013                 }
3014                 for (; i + 3 < tiles; i += 4)
3015                 {
3016                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
3017 
3018                     const float* k01 = kernel01_tm.row(r);
3019 
3020                     int nn = inch; // inch always > 0
3021 
3022                     asm volatile(
3023                         "eor    v16.16b, v16.16b, v16.16b   \n"
3024                         "eor    v17.16b, v17.16b, v17.16b   \n"
3025                         "eor    v18.16b, v18.16b, v18.16b   \n"
3026                         "eor    v19.16b, v19.16b, v19.16b   \n"
3027                         "eor    v20.16b, v20.16b, v20.16b   \n"
3028                         "eor    v21.16b, v21.16b, v21.16b   \n"
3029                         "eor    v22.16b, v22.16b, v22.16b   \n"
3030                         "eor    v23.16b, v23.16b, v23.16b   \n"
3031 
3032                         "0:                                 \n"
3033 
3034                         "prfm   pldl1keep, [%3, #512]       \n"
3035                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
3036 
3037                         "prfm   pldl1keep, [%4, #512]       \n"
3038                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3039 
3040                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3041                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
3042                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
3043                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
3044 
3045                         "fmla   v20.4s, v9.4s, v0.s[0]     \n"
3046                         "fmla   v21.4s, v9.4s, v1.s[0]     \n"
3047                         "fmla   v22.4s, v9.4s, v2.s[0]     \n"
3048                         "fmla   v23.4s, v9.4s, v3.s[0]     \n"
3049 
3050                         "prfm   pldl1keep, [%4, #512]       \n"
3051                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3052 
3053                         "fmla   v16.4s, v10.4s, v0.s[1]      \n"
3054                         "fmla   v17.4s, v10.4s, v1.s[1]      \n"
3055                         "fmla   v18.4s, v10.4s, v2.s[1]      \n"
3056                         "fmla   v19.4s, v10.4s, v3.s[1]      \n"
3057 
3058                         "fmla   v20.4s, v11.4s, v0.s[1]     \n"
3059                         "fmla   v21.4s, v11.4s, v1.s[1]     \n"
3060                         "fmla   v22.4s, v11.4s, v2.s[1]     \n"
3061                         "fmla   v23.4s, v11.4s, v3.s[1]     \n"
3062 
3063                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
3064                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
3065                         "fmla   v18.4s, v12.4s, v2.s[2]     \n"
3066                         "fmla   v19.4s, v12.4s, v3.s[2]     \n"
3067 
3068                         "fmla   v20.4s, v13.4s, v0.s[2]     \n"
3069                         "fmla   v21.4s, v13.4s, v1.s[2]     \n"
3070                         "fmla   v22.4s, v13.4s, v2.s[2]     \n"
3071                         "fmla   v23.4s, v13.4s, v3.s[2]     \n"
3072 
3073                         "subs   %w0, %w0, #1                \n"
3074 
3075                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
3076                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
3077                         "fmla   v18.4s, v14.4s, v2.s[3]     \n"
3078                         "fmla   v19.4s, v14.4s, v3.s[3]     \n"
3079 
3080                         "fmla   v20.4s, v15.4s, v0.s[3]     \n"
3081                         "fmla   v21.4s, v15.4s, v1.s[3]     \n"
3082                         "fmla   v22.4s, v15.4s, v2.s[3]     \n"
3083                         "fmla   v23.4s, v15.4s, v3.s[3]     \n"
3084 
3085                         "bne    0b                          \n"
3086 
3087                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3088                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
3089 
3090                         : "=r"(nn),         // %0
3091                         "=r"(output0_tm), // %1
3092                         "=r"(output1_tm), // %2
3093                         "=r"(r0),         // %3
3094                         "=r"(k01)         // %4
3095                         : "0"(nn),
3096                         "1"(output0_tm),
3097                         "2"(output1_tm),
3098                         "3"(r0),
3099                         "4"(k01)
3100                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
3101                 }
3102                 for (; i + 1 < tiles; i += 2)
3103                 {
3104                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
3105 
3106                     const float* k01 = kernel01_tm.row(r);
3107 
3108                     int nn = inch; // inch always > 0
3109 
3110                     asm volatile(
3111                         "eor    v16.16b, v16.16b, v16.16b   \n"
3112                         "eor    v17.16b, v17.16b, v17.16b   \n"
3113                         "eor    v18.16b, v18.16b, v18.16b   \n"
3114                         "eor    v19.16b, v19.16b, v19.16b   \n"
3115 
3116                         "0:                                 \n"
3117 
3118                         "prfm   pldl1keep, [%3, #256]       \n"
3119                         "ld1    {v0.4s, v1.4s}, [%3], #32   \n" // r0 r1
3120 
3121                         "prfm   pldl1keep, [%4, #512]       \n"
3122                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3123 
3124                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3125                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
3126                         "fmla   v18.4s, v9.4s, v0.s[0]      \n"
3127                         "fmla   v19.4s, v9.4s, v1.s[0]      \n"
3128 
3129                         "prfm   pldl1keep, [%4, #512]       \n"
3130                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3131 
3132                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
3133                         "fmla   v17.4s, v10.4s, v1.s[1]     \n"
3134                         "fmla   v18.4s, v11.4s, v0.s[1]     \n"
3135                         "fmla   v19.4s, v11.4s, v1.s[1]     \n"
3136 
3137                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
3138                         "fmla   v17.4s, v12.4s, v1.s[2]     \n"
3139                         "fmla   v18.4s, v13.4s, v0.s[2]     \n"
3140                         "fmla   v19.4s, v13.4s, v1.s[2]     \n"
3141 
3142                         "subs   %w0, %w0, #1                \n"
3143 
3144                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
3145                         "fmla   v17.4s, v14.4s, v1.s[3]     \n"
3146                         "fmla   v18.4s, v15.4s, v0.s[3]     \n"
3147                         "fmla   v19.4s, v15.4s, v1.s[3]     \n"
3148 
3149                         "bne    0b                          \n"
3150 
3151                         "st1    {v16.4s, v17.4s}, [%1], #32 \n"
3152                         "st1    {v18.4s, v19.4s}, [%2], #32 \n"
3153 
3154                         : "=r"(nn),         // %0
3155                         "=r"(output0_tm), // %1
3156                         "=r"(output1_tm), // %2
3157                         "=r"(r0),         // %3
3158                         "=r"(k01)         // %4
3159                         : "0"(nn),
3160                         "1"(output0_tm),
3161                         "2"(output1_tm),
3162                         "3"(r0),
3163                         "4"(k01)
3164                         : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
3165                 }
3166                 for (; i < tiles; i++)
3167                 {
3168                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
3169 
3170                     const float* k01 = kernel01_tm.row(r);
3171 
3172                     int nn = inch; // inch always > 0
3173 
3174                     asm volatile(
3175                         "eor    v16.16b, v16.16b, v16.16b   \n"
3176                         "eor    v17.16b, v17.16b, v17.16b   \n"
3177 
3178                         "0:                                 \n"
3179 
3180                         "prfm   pldl1keep, [%3, #128]       \n"
3181                         "ld1    {v0.4s}, [%3], #16          \n" // r0
3182 
3183                         "prfm   pldl1keep, [%4, #512]       \n"
3184                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3185 
3186                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3187                         "fmla   v17.4s, v9.4s, v0.s[0]      \n"
3188 
3189                         "prfm   pldl1keep, [%4, #512]       \n"
3190                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3191 
3192                         "fmla   v16.4s, v10.4s, v0.s[1]     \n"
3193                         "fmla   v17.4s, v11.4s, v0.s[1]     \n"
3194 
3195                         "fmla   v16.4s, v12.4s, v0.s[2]     \n"
3196                         "fmla   v17.4s, v13.4s, v0.s[2]     \n"
3197 
3198                         "subs   %w0, %w0, #1                \n"
3199 
3200                         "fmla   v16.4s, v14.4s, v0.s[3]     \n"
3201                         "fmla   v17.4s, v15.4s, v0.s[3]     \n"
3202 
3203                         "bne    0b                          \n"
3204 
3205                         "st1    {v16.4s}, [%1], #16         \n"
3206                         "st1    {v17.4s}, [%2], #16         \n"
3207 
3208                         : "=r"(nn),         // %0
3209                         "=r"(output0_tm), // %1
3210                         "=r"(output1_tm), // %2
3211                         "=r"(r0),         // %3
3212                         "=r"(k01)         // %4
3213                         : "0"(nn),
3214                         "1"(output0_tm),
3215                         "2"(output1_tm),
3216                         "3"(r0),
3217                         "4"(k01)
3218                         : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
3219                 }
3220             }
3221         }
3222 #endif // __ARM_NEON && __aarch64__
3223 
3224         #pragma omp parallel for num_threads(opt.num_threads)
3225         for (int p = remain_outch_start; p < outch; p++)
3226         {
3227             float* output0_tm = top_blob_tm.channel(p);
3228 
3229 #if __aarch64__
3230             const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
3231 #else
3232             const Mat kernel0_tm = kernel_tm.channel(p);
3233 #endif
3234 
3235             for (int r = 0; r < 36; r++)
3236             {
3237                 const Mat bb2 = bottom_blob_tm2.channel(r);
3238 
3239                 int i = 0;
3240 #if __aarch64__
3241                 for (; i + 11 < tiles; i += 12)
3242                 {
3243                     const float* r0 = bb2.row(i / 12);
3244 
3245                     const float* k0 = kernel0_tm.row(r);
3246 
3247                     int nn = inch; // inch always > 0
3248 
3249                     asm volatile(
3250                         "eor    v8.16b, v8.16b, v8.16b      \n"
3251                         "eor    v9.16b, v9.16b, v9.16b      \n"
3252                         "eor    v10.16b, v10.16b, v10.16b   \n"
3253                         "eor    v11.16b, v11.16b, v11.16b   \n"
3254                         "eor    v12.16b, v12.16b, v12.16b   \n"
3255                         "eor    v13.16b, v13.16b, v13.16b   \n"
3256                         "eor    v14.16b, v14.16b, v14.16b   \n"
3257                         "eor    v15.16b, v15.16b, v15.16b   \n"
3258                         "eor    v16.16b, v16.16b, v16.16b   \n"
3259                         "eor    v17.16b, v17.16b, v17.16b   \n"
3260                         "eor    v18.16b, v18.16b, v18.16b   \n"
3261                         "eor    v19.16b, v19.16b, v19.16b   \n"
3262 
3263                         "0:                                 \n"
3264 
3265                         "prfm   pldl1keep, [%2, #512]       \n"
3266                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
3267 
3268                         "prfm   pldl1keep, [%3, #512]       \n"
3269                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0
3270 
3271                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
3272                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
3273                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
3274                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
3275                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
3276                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
3277                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
3278                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
3279                         "fmla   v16.4s, v4.4s, v2.s[0]      \n"
3280                         "fmla   v17.4s, v4.4s, v2.s[1]      \n"
3281                         "fmla   v18.4s, v4.4s, v2.s[2]      \n"
3282                         "fmla   v19.4s, v4.4s, v2.s[3]      \n"
3283 
3284                         "prfm   pldl1keep, [%2, #512]       \n"
3285                         "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
3286 
3287                         "fmla   v8.4s, v5.4s, v3.s[0]       \n"
3288                         "fmla   v9.4s, v5.4s, v3.s[1]       \n"
3289                         "fmla   v10.4s, v5.4s, v3.s[2]      \n"
3290                         "fmla   v11.4s, v5.4s, v3.s[3]      \n"
3291                         "fmla   v12.4s, v5.4s, v20.s[0]     \n"
3292                         "fmla   v13.4s, v5.4s, v20.s[1]     \n"
3293                         "fmla   v14.4s, v5.4s, v20.s[2]     \n"
3294                         "fmla   v15.4s, v5.4s, v20.s[3]     \n"
3295                         "fmla   v16.4s, v5.4s, v21.s[0]     \n"
3296                         "fmla   v17.4s, v5.4s, v21.s[1]     \n"
3297                         "fmla   v18.4s, v5.4s, v21.s[2]     \n"
3298                         "fmla   v19.4s, v5.4s, v21.s[3]     \n"
3299 
3300                         "prfm   pldl1keep, [%2, #512]       \n"
3301                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
3302 
3303                         "fmla   v8.4s, v6.4s, v22.s[0]      \n"
3304                         "fmla   v9.4s, v6.4s, v22.s[1]      \n"
3305                         "fmla   v10.4s, v6.4s, v22.s[2]     \n"
3306                         "fmla   v11.4s, v6.4s, v22.s[3]     \n"
3307                         "fmla   v12.4s, v6.4s, v23.s[0]     \n"
3308                         "fmla   v13.4s, v6.4s, v23.s[1]     \n"
3309                         "fmla   v14.4s, v6.4s, v23.s[2]     \n"
3310                         "fmla   v15.4s, v6.4s, v23.s[3]     \n"
3311                         "fmla   v16.4s, v6.4s, v24.s[0]     \n"
3312                         "fmla   v17.4s, v6.4s, v24.s[1]     \n"
3313                         "fmla   v18.4s, v6.4s, v24.s[2]     \n"
3314                         "fmla   v19.4s, v6.4s, v24.s[3]     \n"
3315 
3316                         "subs   %w0, %w0, #1                \n"
3317 
3318                         "fmla   v8.4s, v7.4s, v25.s[0]      \n"
3319                         "fmla   v9.4s, v7.4s, v25.s[1]      \n"
3320                         "fmla   v10.4s, v7.4s, v25.s[2]     \n"
3321                         "fmla   v11.4s, v7.4s, v25.s[3]     \n"
3322                         "fmla   v12.4s, v7.4s, v26.s[0]     \n"
3323                         "fmla   v13.4s, v7.4s, v26.s[1]     \n"
3324                         "fmla   v14.4s, v7.4s, v26.s[2]     \n"
3325                         "fmla   v15.4s, v7.4s, v26.s[3]     \n"
3326                         "fmla   v16.4s, v7.4s, v27.s[0]     \n"
3327                         "fmla   v17.4s, v7.4s, v27.s[1]     \n"
3328                         "fmla   v18.4s, v7.4s, v27.s[2]     \n"
3329                         "fmla   v19.4s, v7.4s, v27.s[3]     \n"
3330 
3331                         "bne    0b                          \n"
3332 
3333                         "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
3334                         "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
3335                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3336 
3337                         : "=r"(nn),         // %0
3338                         "=r"(output0_tm), // %1
3339                         "=r"(r0),         // %2
3340                         "=r"(k0)          // %3
3341                         : "0"(nn),
3342                         "1"(output0_tm),
3343                         "2"(r0),
3344                         "3"(k0)
3345                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
3346                 }
3347 #endif
3348                 for (; i + 7 < tiles; i += 8)
3349                 {
3350 #if __aarch64__
3351                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
3352 #else
3353                     const float* r0 = bb2.row(i / 8);
3354 #endif
3355 
3356                     const float* k0 = kernel0_tm.row(r);
3357 
3358                     int nn = inch; // inch always > 0
3359 
3360 #if __aarch64__
3361                     asm volatile(
3362                         "eor    v16.16b, v16.16b, v16.16b   \n"
3363                         "eor    v17.16b, v17.16b, v17.16b   \n"
3364                         "eor    v18.16b, v18.16b, v18.16b   \n"
3365                         "eor    v19.16b, v19.16b, v19.16b   \n"
3366                         "eor    v20.16b, v20.16b, v20.16b   \n"
3367                         "eor    v21.16b, v21.16b, v21.16b   \n"
3368                         "eor    v22.16b, v22.16b, v22.16b   \n"
3369                         "eor    v23.16b, v23.16b, v23.16b   \n"
3370 
3371                         "0:                                 \n"
3372 
3373                         "prfm   pldl1keep, [%2, #512]       \n"
3374                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
3375 
3376                         "prfm   pldl1keep, [%3, #512]       \n"
3377                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3378 
3379                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3380                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
3381                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
3382                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
3383 
3384                         "prfm   pldl1keep, [%2, #512]       \n"
3385                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7
3386 
3387                         "fmla   v20.4s, v8.4s, v4.s[0]      \n"
3388                         "fmla   v21.4s, v8.4s, v5.s[0]      \n"
3389                         "fmla   v22.4s, v8.4s, v6.s[0]      \n"
3390                         "fmla   v23.4s, v8.4s, v7.s[0]      \n"
3391 
3392                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
3393                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
3394                         "fmla   v18.4s, v9.4s, v2.s[1]      \n"
3395                         "fmla   v19.4s, v9.4s, v3.s[1]      \n"
3396                         "fmla   v20.4s, v9.4s, v4.s[1]      \n"
3397                         "fmla   v21.4s, v9.4s, v5.s[1]      \n"
3398                         "fmla   v22.4s, v9.4s, v6.s[1]      \n"
3399                         "fmla   v23.4s, v9.4s, v7.s[1]      \n"
3400 
3401                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
3402                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
3403                         "fmla   v18.4s, v10.4s, v2.s[2]     \n"
3404                         "fmla   v19.4s, v10.4s, v3.s[2]     \n"
3405                         "fmla   v20.4s, v10.4s, v4.s[2]     \n"
3406                         "fmla   v21.4s, v10.4s, v5.s[2]     \n"
3407                         "fmla   v22.4s, v10.4s, v6.s[2]     \n"
3408                         "fmla   v23.4s, v10.4s, v7.s[2]     \n"
3409 
3410                         "subs   %w0, %w0, #1                \n"
3411 
3412                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
3413                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
3414                         "fmla   v18.4s, v11.4s, v2.s[3]     \n"
3415                         "fmla   v19.4s, v11.4s, v3.s[3]     \n"
3416                         "fmla   v20.4s, v11.4s, v4.s[3]     \n"
3417                         "fmla   v21.4s, v11.4s, v5.s[3]     \n"
3418                         "fmla   v22.4s, v11.4s, v6.s[3]     \n"
3419                         "fmla   v23.4s, v11.4s, v7.s[3]     \n"
3420 
3421                         "bne    0b                          \n"
3422 
3423                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3424                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
3425 
3426                         : "=r"(nn),         // %0
3427                         "=r"(output0_tm), // %1
3428                         "=r"(r0),         // %2
3429                         "=r"(k0)          // %3
3430                         : "0"(nn),
3431                         "1"(output0_tm),
3432                         "2"(r0),
3433                         "3"(k0)
3434                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
3435 #else
3436                     asm volatile(
3437                         "veor       q8, q8          \n"
3438                         "veor       q9, q9          \n"
3439                         "veor       q10, q10        \n"
3440                         "veor       q11, q11        \n"
3441                         "veor       q12, q12        \n"
3442                         "veor       q13, q13        \n"
3443                         "veor       q14, q14        \n"
3444                         "veor       q15, q15        \n"
3445 
3446                         "0:                         \n"
3447 
3448                         "pld        [%2, #512]      \n"
3449                         "vldm       %2!, {d0-d7}    \n"
3450 
3451                         "pld        [%3, #512]      \n"
3452                         "vldm       %3!, {d8-d15}   \n"
3453 
3454                         "vmla.f32   q8, q4, d0[0]   \n"
3455                         "vmla.f32   q9, q4, d0[1]   \n"
3456                         "vmla.f32   q10, q4, d1[0]  \n"
3457                         "vmla.f32   q11, q4, d1[1]  \n"
3458                         "vmla.f32   q12, q4, d2[0]  \n"
3459                         "vmla.f32   q13, q4, d2[1]  \n"
3460                         "vmla.f32   q14, q4, d3[0]  \n"
3461                         "vmla.f32   q15, q4, d3[1]  \n"
3462 
3463                         "vmla.f32   q8, q5, d4[0]   \n"
3464                         "vmla.f32   q9, q5, d4[1]   \n"
3465                         "vmla.f32   q10, q5, d5[0]  \n"
3466                         "vmla.f32   q11, q5, d5[1]  \n"
3467                         "vmla.f32   q12, q5, d6[0]  \n"
3468                         "vmla.f32   q13, q5, d6[1]  \n"
3469                         "vmla.f32   q14, q5, d7[0]  \n"
3470                         "vmla.f32   q15, q5, d7[1]  \n"
3471 
3472                         "pld        [%2, #512]      \n"
3473                         "vldm       %2!, {d0-d7}    \n"
3474 
3475                         "vmla.f32   q8, q6, d0[0]   \n"
3476                         "vmla.f32   q9, q6, d0[1]   \n"
3477                         "vmla.f32   q10, q6, d1[0]  \n"
3478                         "vmla.f32   q11, q6, d1[1]  \n"
3479                         "vmla.f32   q12, q6, d2[0]  \n"
3480                         "vmla.f32   q13, q6, d2[1]  \n"
3481                         "vmla.f32   q14, q6, d3[0]  \n"
3482                         "vmla.f32   q15, q6, d3[1]  \n"
3483 
3484                         "subs       %0, %0, #1      \n"
3485 
3486                         "vmla.f32   q8, q7, d4[0]   \n"
3487                         "vmla.f32   q9, q7, d4[1]   \n"
3488                         "vmla.f32   q10, q7, d5[0]  \n"
3489                         "vmla.f32   q11, q7, d5[1]  \n"
3490                         "vmla.f32   q12, q7, d6[0]  \n"
3491                         "vmla.f32   q13, q7, d6[1]  \n"
3492                         "vmla.f32   q14, q7, d7[0]  \n"
3493                         "vmla.f32   q15, q7, d7[1]  \n"
3494 
3495                         "bne        0b              \n"
3496 
3497                         "vstm       %1!, {d16-d23}  \n"
3498                         "vstm       %1!, {d24-d31}  \n"
3499 
3500                         : "=r"(nn),         // %0
3501                         "=r"(output0_tm), // %1
3502                         "=r"(r0),         // %2
3503                         "=r"(k0)          // %3
3504                         : "0"(nn),
3505                         "1"(output0_tm),
3506                         "2"(r0),
3507                         "3"(k0)
3508                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
3509 #endif
3510                 }
3511                 for (; i + 3 < tiles; i += 4)
3512                 {
3513 #if __aarch64__
3514                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
3515 #else
3516                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
3517 #endif
3518 
3519                     const float* k0 = kernel0_tm.row(r);
3520 
3521                     int nn = inch; // inch always > 0
3522 
3523 #if __aarch64__
3524                     asm volatile(
3525                         "eor    v16.16b, v16.16b, v16.16b   \n"
3526                         "eor    v17.16b, v17.16b, v17.16b   \n"
3527                         "eor    v18.16b, v18.16b, v18.16b   \n"
3528                         "eor    v19.16b, v19.16b, v19.16b   \n"
3529 
3530                         "0:                                 \n"
3531 
3532                         "prfm   pldl1keep, [%2, #512]       \n"
3533                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
3534 
3535                         "prfm   pldl1keep, [%3, #512]       \n"
3536                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3537 
3538                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3539                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
3540                         "fmla   v18.4s, v8.4s, v2.s[0]      \n"
3541                         "fmla   v19.4s, v8.4s, v3.s[0]      \n"
3542 
3543                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
3544                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
3545                         "fmla   v18.4s, v9.4s, v2.s[1]      \n"
3546                         "fmla   v19.4s, v9.4s, v3.s[1]      \n"
3547 
3548                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
3549                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
3550                         "fmla   v18.4s, v10.4s, v2.s[2]     \n"
3551                         "fmla   v19.4s, v10.4s, v3.s[2]     \n"
3552 
3553                         "subs   %w0, %w0, #1                \n"
3554 
3555                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
3556                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
3557                         "fmla   v18.4s, v11.4s, v2.s[3]     \n"
3558                         "fmla   v19.4s, v11.4s, v3.s[3]     \n"
3559 
3560                         "bne    0b                          \n"
3561 
3562                         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3563 
3564                         : "=r"(nn),         // %0
3565                         "=r"(output0_tm), // %1
3566                         "=r"(r0),         // %2
3567                         "=r"(k0)          // %3
3568                         : "0"(nn),
3569                         "1"(output0_tm),
3570                         "2"(r0),
3571                         "3"(k0)
3572                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
3573 #else
3574                     asm volatile(
3575                         "veor       q8, q8          \n"
3576                         "veor       q9, q9          \n"
3577                         "veor       q10, q10        \n"
3578                         "veor       q11, q11        \n"
3579 
3580                         "0:                         \n"
3581 
3582                         "pld        [%2, #512]      \n"
3583                         "vldm       %2!, {d0-d7}    \n"
3584 
3585                         "pld        [%3, #512]      \n"
3586                         "vldm       %3!, {d8-d15}   \n"
3587 
3588                         "vmla.f32   q8, q4, d0[0]   \n"
3589                         "vmla.f32   q9, q4, d2[0]   \n"
3590                         "vmla.f32   q10, q4, d4[0]  \n"
3591                         "vmla.f32   q11, q4, d6[0]  \n"
3592 
3593                         "vmla.f32   q8, q5, d0[1]   \n"
3594                         "vmla.f32   q9, q5, d2[1]   \n"
3595                         "vmla.f32   q10, q5, d4[1]  \n"
3596                         "vmla.f32   q11, q5, d6[1]  \n"
3597 
3598                         "vmla.f32   q8, q6, d1[0]   \n"
3599                         "vmla.f32   q9, q6, d3[0]   \n"
3600                         "vmla.f32   q10, q6, d5[0]  \n"
3601                         "vmla.f32   q11, q6, d7[0]  \n"
3602 
3603                         "subs       %0, %0, #1      \n"
3604 
3605                         "vmla.f32   q8, q7, d1[1]   \n"
3606                         "vmla.f32   q9, q7, d3[1]   \n"
3607                         "vmla.f32   q10, q7, d5[1]  \n"
3608                         "vmla.f32   q11, q7, d7[1]  \n"
3609 
3610                         "bne        0b              \n"
3611 
3612                         "vstm       %1!, {d16-d23}  \n"
3613 
3614                         : "=r"(nn),         // %0
3615                         "=r"(output0_tm), // %1
3616                         "=r"(r0),         // %2
3617                         "=r"(k0)          // %3
3618                         : "0"(nn),
3619                         "1"(output0_tm),
3620                         "2"(r0),
3621                         "3"(k0)
3622                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
3623 #endif
3624                 }
3625                 for (; i + 1 < tiles; i += 2)
3626                 {
3627 #if __aarch64__
3628                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
3629 #else
3630                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
3631 #endif
3632 
3633                     const float* k0 = kernel0_tm.row(r);
3634 
3635                     int nn = inch; // inch always > 0
3636 
3637 #if __aarch64__
3638                     asm volatile(
3639                         "eor    v16.16b, v16.16b, v16.16b   \n"
3640                         "eor    v17.16b, v17.16b, v17.16b   \n"
3641 
3642                         "0:                                 \n"
3643 
3644                         "prfm   pldl1keep, [%2, #256]       \n"
3645                         "ld1    {v0.4s, v1.4s}, [%2], #32   \n" // r0 r1
3646 
3647                         "prfm   pldl1keep, [%3, #512]       \n"
3648                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3649 
3650                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3651                         "fmla   v17.4s, v8.4s, v1.s[0]      \n"
3652 
3653                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
3654                         "fmla   v17.4s, v9.4s, v1.s[1]      \n"
3655 
3656                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
3657                         "fmla   v17.4s, v10.4s, v1.s[2]     \n"
3658 
3659                         "subs   %w0, %w0, #1                \n"
3660 
3661                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
3662                         "fmla   v17.4s, v11.4s, v1.s[3]     \n"
3663 
3664                         "bne    0b                          \n"
3665 
3666                         "st1    {v16.4s, v17.4s}, [%1], #32 \n"
3667 
3668                         : "=r"(nn),         // %0
3669                         "=r"(output0_tm), // %1
3670                         "=r"(r0),         // %2
3671                         "=r"(k0)          // %3
3672                         : "0"(nn),
3673                         "1"(output0_tm),
3674                         "2"(r0),
3675                         "3"(k0)
3676                         : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
3677 #else
3678                     asm volatile(
3679                         "veor       q8, q8          \n"
3680                         "veor       q9, q9          \n"
3681 
3682                         "0:                         \n"
3683 
3684                         "pld        [%2, #256]      \n"
3685                         "vld1.f32   {d0-d3}, [%2 :128]! \n"
3686 
3687                         "pld        [%3, #512]      \n"
3688                         "vldm       %3!, {d8-d15}   \n"
3689 
3690                         "vmla.f32   q8, q4, d0[0]   \n"
3691                         "vmla.f32   q9, q4, d2[0]   \n"
3692 
3693                         "vmla.f32   q8, q5, d0[1]   \n"
3694                         "vmla.f32   q9, q5, d2[1]   \n"
3695 
3696                         "vmla.f32   q8, q6, d1[0]   \n"
3697                         "vmla.f32   q9, q6, d3[0]   \n"
3698 
3699                         "subs       %0, %0, #1      \n"
3700 
3701                         "vmla.f32   q8, q7, d1[1]   \n"
3702                         "vmla.f32   q9, q7, d3[1]   \n"
3703 
3704                         "bne        0b              \n"
3705 
3706                         "vst1.f32   {d16-d19}, [%1 :128]! \n"
3707 
3708                         : "=r"(nn),         // %0
3709                         "=r"(output0_tm), // %1
3710                         "=r"(r0),         // %2
3711                         "=r"(k0)          // %3
3712                         : "0"(nn),
3713                         "1"(output0_tm),
3714                         "2"(r0),
3715                         "3"(k0)
3716                         : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
3717 #endif
3718                 }
3719                 for (; i < tiles; i++)
3720                 {
3721 #if __aarch64__
3722                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
3723 #else
3724                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
3725 #endif
3726 
3727                     const float* k0 = kernel0_tm.row(r);
3728 
3729                     int nn = inch; // inch always > 0
3730 
3731 #if __aarch64__
3732                     asm volatile(
3733                         "eor    v16.16b, v16.16b, v16.16b   \n"
3734 
3735                         "0:                                 \n"
3736 
3737                         "prfm   pldl1keep, [%2, #128]       \n"
3738                         "ld1    {v0.4s}, [%2], #16          \n" // r0
3739 
3740                         "prfm   pldl1keep, [%3, #512]       \n"
3741                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3742 
3743                         "fmla   v16.4s, v8.4s, v0.s[0]      \n"
3744                         "fmla   v16.4s, v9.4s, v0.s[1]      \n"
3745 
3746                         "subs   %w0, %w0, #1                \n"
3747 
3748                         "fmla   v16.4s, v10.4s, v0.s[2]     \n"
3749                         "fmla   v16.4s, v11.4s, v0.s[3]     \n"
3750 
3751                         "bne    0b                          \n"
3752 
3753                         "st1    {v16.4s}, [%1], #16         \n"
3754 
3755                         : "=r"(nn),         // %0
3756                         "=r"(output0_tm), // %1
3757                         "=r"(r0),         // %2
3758                         "=r"(k0)          // %3
3759                         : "0"(nn),
3760                         "1"(output0_tm),
3761                         "2"(r0),
3762                         "3"(k0)
3763                         : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
3764 #else
3765                     asm volatile(
3766                         "veor       q8, q8          \n"
3767 
3768                         "0:                         \n"
3769 
3770                         "pld        [%2, #128]      \n"
3771                         "vld1.f32   {d0-d1}, [%2 :128]! \n"
3772 
3773                         "pld        [%3, #512]      \n"
3774                         "vldm       %3!, {d8-d15}   \n"
3775 
3776                         "vmla.f32   q8, q4, d0[0]   \n"
3777                         "vmla.f32   q8, q5, d0[1]   \n"
3778 
3779                         "subs       %0, %0, #1      \n"
3780 
3781                         "vmla.f32   q8, q6, d1[0]   \n"
3782                         "vmla.f32   q8, q7, d1[1]   \n"
3783 
3784                         "bne        0b              \n"
3785 
3786                         "vst1.f32   {d16-d17}, [%1 :128]! \n"
3787 
3788                         : "=r"(nn),         // %0
3789                         "=r"(output0_tm), // %1
3790                         "=r"(r0),         // %2
3791                         "=r"(k0)          // %3
3792                         : "0"(nn),
3793                         "1"(output0_tm),
3794                         "2"(r0),
3795                         "3"(k0)
3796                         : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
3797 #endif
3798                 }
3799             }
3800         }
3801     }
3802     bottom_blob_tm = Mat();
3803     // END dot
3804 
3805     // BEGIN transform output
3806     Mat top_blob_bordered;
3807     if (outw == top_blob.w && outh == top_blob.h)
3808     {
3809         top_blob_bordered = top_blob;
3810     }
3811     else
3812     {
3813         top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
3814     }
3815     {
3816         // const float otm[4][6] = {
3817         //     {1.0f, 1.0f,  1.0f, 1.0f,  1.0f, 0.0f},
3818         //     {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
3819         //     {0.0f, 1.0f,  1.0f, 4.0f,  4.0f, 0.0f},
3820         //     {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
3821         // };
3822 
3823         // 0 = r00 + (r01 + r02) + (r03 + r04)
3824         // 1 =       (r01 - r02) + (r03 - r04) * 2
3825         // 2 =       (r01 + r02) + (r03 + r04) * 4
3826         // 3 = r05 + (r01 - r02) + (r03 - r04) * 8
3827 
3828         int w_tm = outw / 4 * 6;
3829         int h_tm = outh / 4 * 6;
3830         const int tiles = w_tm / 6 * h_tm / 6;
3831 
3832         #pragma omp parallel for num_threads(opt.num_threads)
3833         for (int p = 0; p < outch; p++)
3834         {
3835             const Mat out0_tm = top_blob_tm.channel(p);
3836             Mat out0 = top_blob_bordered.channel(p);
3837 
3838             // const float bias0 = bias ? bias[p] : 0.f;
3839             float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
3840 
3841             float tmp[4][6][4];
3842 
3843             // tile
3844             for (int i = 0; i < outh / 4; i++)
3845             {
3846                 for (int j = 0; j < outw / 4; j++)
3847                 {
3848                     // top_blob_tm.create(tiles, 36, outch, elemsize, elempack);
3849 
3850                     const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 6 + j) * 4;
3851                     const float* output0_tm_1 = output0_tm_0 + tiles * 4;
3852                     const float* output0_tm_2 = output0_tm_0 + tiles * 8;
3853                     const float* output0_tm_3 = output0_tm_0 + tiles * 12;
3854                     const float* output0_tm_4 = output0_tm_0 + tiles * 16;
3855                     const float* output0_tm_5 = output0_tm_0 + tiles * 20;
3856 
3857                     float* output0 = out0.row(i * 4) + (j * 4) * 4;
3858 
3859                     // TODO neon optimize
3860                     for (int m = 0; m < 6; m++)
3861                     {
3862                         float32x4_t _out0tm0 = vld1q_f32(output0_tm_0);
3863                         float32x4_t _out0tm1 = vld1q_f32(output0_tm_1);
3864                         float32x4_t _out0tm2 = vld1q_f32(output0_tm_2);
3865                         float32x4_t _out0tm3 = vld1q_f32(output0_tm_3);
3866                         float32x4_t _out0tm4 = vld1q_f32(output0_tm_4);
3867                         float32x4_t _out0tm5 = vld1q_f32(output0_tm_5);
3868 
3869                         float32x4_t _tmp02a = vaddq_f32(_out0tm1, _out0tm2);
3870                         float32x4_t _tmp13a = vsubq_f32(_out0tm1, _out0tm2);
3871 
3872                         float32x4_t _tmp02b = vaddq_f32(_out0tm3, _out0tm4);
3873                         float32x4_t _tmp13b = vsubq_f32(_out0tm3, _out0tm4);
3874 
3875                         float32x4_t _tmp0m = vaddq_f32(vaddq_f32(_out0tm0, _tmp02a), _tmp02b);
3876                         float32x4_t _tmp1m = vmlaq_n_f32(_tmp13a, _tmp13b, 2.f);
3877                         float32x4_t _tmp2m = vmlaq_n_f32(_tmp02a, _tmp02b, 4.f);
3878                         float32x4_t _tmp3m = vmlaq_n_f32(vaddq_f32(_out0tm5, _tmp13a), _tmp13b, 8.f);
3879 
3880                         vst1q_f32(tmp[0][m], _tmp0m);
3881                         vst1q_f32(tmp[1][m], _tmp1m);
3882                         vst1q_f32(tmp[2][m], _tmp2m);
3883                         vst1q_f32(tmp[3][m], _tmp3m);
3884 
3885                         output0_tm_0 += tiles * 24;
3886                         output0_tm_1 += tiles * 24;
3887                         output0_tm_2 += tiles * 24;
3888                         output0_tm_3 += tiles * 24;
3889                         output0_tm_4 += tiles * 24;
3890                         output0_tm_5 += tiles * 24;
3891                     }
3892 
3893                     for (int m = 0; m < 4; m++)
3894                     {
3895                         float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
3896                         float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
3897                         float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
3898                         float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
3899                         float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
3900                         float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
3901 
3902                         float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02);
3903                         float32x4_t _tmp13a = vsubq_f32(_tmp01, _tmp02);
3904 
3905                         float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04);
3906                         float32x4_t _tmp13b = vsubq_f32(_tmp03, _tmp04);
3907 
3908                         float32x4_t _out00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp00, _tmp02a), _tmp02b));
3909                         float32x4_t _out01 = vaddq_f32(_bias0, vmlaq_n_f32(_tmp13a, _tmp13b, 2.f));
3910                         float32x4_t _out02 = vaddq_f32(_bias0, vmlaq_n_f32(_tmp02a, _tmp02b, 4.f));
3911                         float32x4_t _out03 = vaddq_f32(_bias0, vmlaq_n_f32(vaddq_f32(_tmp05, _tmp13a), _tmp13b, 8.f));
3912 
3913                         vst1q_f32(output0, _out00);
3914                         vst1q_f32(output0 + 4, _out01);
3915                         vst1q_f32(output0 + 8, _out02);
3916                         vst1q_f32(output0 + 12, _out03);
3917 
3918                         output0 += outw * 4;
3919                     }
3920                 }
3921             }
3922         }
3923     }
3924     // END transform output
3925 
    // cut the padded border off the result (top_blob_bordered -> top_blob)
3927     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
3928 }
3929 
conv3x3s2_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)3930 static void conv3x3s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
3931 {
3932     int w = bottom_blob.w;
3933     int inch = bottom_blob.c;
3934     int outw = top_blob.w;
3935     int outh = top_blob.h;
3936     int outch = top_blob.c;
3937 
3938     const int tailstep = (w - 2 * outw + w) * 4;
3939 
3940     const float* bias = _bias;
3941 
3942     #pragma omp parallel for num_threads(opt.num_threads)
3943     for (int p = 0; p < outch; p++)
3944     {
3945         Mat out0 = top_blob.channel(p);
3946 
3947         float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
3948         out0.fill(_bias0);
3949 
3950         for (int q = 0; q < inch; q++)
3951         {
3952             float* outptr0 = out0.row(0);
3953 
3954             const Mat img0 = bottom_blob.channel(q);
3955 
3956             const float* r0 = img0.row(0);
3957             const float* r1 = img0.row(1);
3958             const float* r2 = img0.row(2);
3959 
3960             const float* kptr = (const float*)kernel.channel(p).row(q);
3961 
3962             int i = 0;
3963             for (; i < outh; i++)
3964             {
3965                 int j = 0;
3966                 for (; j + 3 < outw; j += 4)
3967                 {
3968 #if __aarch64__
3969                     asm volatile(
3970                         "prfm   pldl1keep, [%0, #512]       \n"
3971                         "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3
3972 
3973                         "prfm   pldl1keep, [%1, #512]       \n"
3974                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03
3975 
3976                         "prfm   pldl1keep, [%1, #512]       \n"
3977                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n" // r04 r05 r06 r07
3978 
3979                         "prfm   pldl1keep, [%4, #512]       \n"
3980                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
3981 
3982                         "fmla   v20.4s, v16.4s, v0.s[0]     \n"
3983                         "fmla   v21.4s, v16.4s, v2.s[0]     \n"
3984                         "fmla   v22.4s, v16.4s, v4.s[0]     \n"
3985                         "fmla   v23.4s, v16.4s, v6.s[0]     \n"
3986                         "fmla   v20.4s, v17.4s, v0.s[1]     \n"
3987                         "fmla   v21.4s, v17.4s, v2.s[1]     \n"
3988                         "fmla   v22.4s, v17.4s, v4.s[1]     \n"
3989                         "fmla   v23.4s, v17.4s, v6.s[1]     \n"
3990 
3991                         "prfm   pldl1keep, [%4, #512]       \n"
3992                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
3993 
3994                         "fmla   v20.4s, v18.4s, v0.s[2]     \n"
3995                         "fmla   v21.4s, v18.4s, v2.s[2]     \n"
3996                         "fmla   v22.4s, v18.4s, v4.s[2]     \n"
3997                         "fmla   v23.4s, v18.4s, v6.s[2]     \n"
3998                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
3999                         "fmla   v21.4s, v19.4s, v2.s[3]     \n"
4000                         "fmla   v22.4s, v19.4s, v4.s[3]     \n"
4001                         "fmla   v23.4s, v19.4s, v6.s[3]     \n"
4002 
4003                         "prfm   pldl1keep, [%1, #128]       \n"
4004                         "ld1    {v28.4s}, [%1]              \n" // r08
4005 
4006                         "fmla   v20.4s, v24.4s, v1.s[0]     \n"
4007                         "fmla   v21.4s, v24.4s, v3.s[0]     \n"
4008                         "fmla   v22.4s, v24.4s, v5.s[0]     \n"
4009                         "fmla   v23.4s, v24.4s, v7.s[0]     \n"
4010                         "fmla   v20.4s, v25.4s, v1.s[1]     \n"
4011                         "fmla   v21.4s, v25.4s, v3.s[1]     \n"
4012                         "fmla   v22.4s, v25.4s, v5.s[1]     \n"
4013                         "fmla   v23.4s, v25.4s, v7.s[1]     \n"
4014 
4015                         "prfm   pldl1keep, [%4, #512]       \n"
4016                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4017 
4018                         "fmla   v20.4s, v26.4s, v1.s[2]     \n"
4019                         "fmla   v21.4s, v26.4s, v3.s[2]     \n"
4020                         "fmla   v22.4s, v26.4s, v5.s[2]     \n"
4021                         "fmla   v23.4s, v26.4s, v7.s[2]     \n"
4022                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4023                         "fmla   v21.4s, v27.4s, v3.s[3]     \n"
4024                         "fmla   v22.4s, v27.4s, v5.s[3]     \n"
4025                         "fmla   v23.4s, v27.4s, v7.s[3]     \n"
4026 
4027                         "prfm   pldl1keep, [%2, #512]       \n"
4028                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%2], #64 \n" // r10 r11 r12 r13
4029 
4030                         "fmla   v20.4s, v16.4s, v2.s[0]     \n"
4031                         "fmla   v21.4s, v16.4s, v4.s[0]     \n"
4032                         "fmla   v22.4s, v16.4s, v6.s[0]     \n"
4033                         "fmla   v23.4s, v16.4s, v28.s[0]    \n"
4034                         "fmla   v20.4s, v17.4s, v2.s[1]     \n"
4035                         "fmla   v21.4s, v17.4s, v4.s[1]     \n"
4036                         "fmla   v22.4s, v17.4s, v6.s[1]     \n"
4037                         "fmla   v23.4s, v17.4s, v28.s[1]    \n"
4038 
4039                         "prfm   pldl1keep, [%4, #512]       \n"
4040                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4041 
4042                         "fmla   v20.4s, v18.4s, v2.s[2]     \n"
4043                         "fmla   v21.4s, v18.4s, v4.s[2]     \n"
4044                         "fmla   v22.4s, v18.4s, v6.s[2]     \n"
4045                         "fmla   v23.4s, v18.4s, v28.s[2]    \n"
4046                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4047                         "fmla   v21.4s, v19.4s, v4.s[3]     \n"
4048                         "fmla   v22.4s, v19.4s, v6.s[3]     \n"
4049                         "fmla   v23.4s, v19.4s, v28.s[3]    \n"
4050 
4051                         "prfm   pldl1keep, [%2, #512]       \n"
4052                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n" // r14 r15 r16 r17
4053 
4054                         "fmla   v20.4s, v24.4s, v8.s[0]     \n"
4055                         "fmla   v21.4s, v24.4s, v10.s[0]    \n"
4056                         "fmla   v22.4s, v24.4s, v12.s[0]    \n"
4057                         "fmla   v23.4s, v24.4s, v14.s[0]    \n"
4058                         "fmla   v20.4s, v25.4s, v8.s[1]     \n"
4059                         "fmla   v21.4s, v25.4s, v10.s[1]    \n"
4060                         "fmla   v22.4s, v25.4s, v12.s[1]    \n"
4061                         "fmla   v23.4s, v25.4s, v14.s[1]    \n"
4062 
4063                         "prfm   pldl1keep, [%4, #512]       \n"
4064                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4065 
4066                         "fmla   v20.4s, v26.4s, v8.s[2]     \n"
4067                         "fmla   v21.4s, v26.4s, v10.s[2]    \n"
4068                         "fmla   v22.4s, v26.4s, v12.s[2]    \n"
4069                         "fmla   v23.4s, v26.4s, v14.s[2]    \n"
4070                         "fmla   v20.4s, v27.4s, v8.s[3]     \n"
4071                         "fmla   v21.4s, v27.4s, v10.s[3]    \n"
4072                         "fmla   v22.4s, v27.4s, v12.s[3]    \n"
4073                         "fmla   v23.4s, v27.4s, v14.s[3]    \n"
4074 
4075                         "prfm   pldl1keep, [%2, #128]       \n"
4076                         "ld1    {v28.4s}, [%2]              \n" // r18
4077 
4078                         "fmla   v20.4s, v16.4s, v9.s[0]     \n"
4079                         "fmla   v21.4s, v16.4s, v11.s[0]    \n"
4080                         "fmla   v22.4s, v16.4s, v13.s[0]    \n"
4081                         "fmla   v23.4s, v16.4s, v15.s[0]    \n"
4082                         "fmla   v20.4s, v17.4s, v9.s[1]     \n"
4083                         "fmla   v21.4s, v17.4s, v11.s[1]    \n"
4084                         "fmla   v22.4s, v17.4s, v13.s[1]    \n"
4085                         "fmla   v23.4s, v17.4s, v15.s[1]    \n"
4086 
4087                         "prfm   pldl1keep, [%4, #512]       \n"
4088                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4089 
4090                         "fmla   v20.4s, v18.4s, v9.s[2]     \n"
4091                         "fmla   v21.4s, v18.4s, v11.s[2]    \n"
4092                         "fmla   v22.4s, v18.4s, v13.s[2]    \n"
4093                         "fmla   v23.4s, v18.4s, v15.s[2]    \n"
4094                         "fmla   v20.4s, v19.4s, v9.s[3]     \n"
4095                         "fmla   v21.4s, v19.4s, v11.s[3]    \n"
4096                         "fmla   v22.4s, v19.4s, v13.s[3]    \n"
4097                         "fmla   v23.4s, v19.4s, v15.s[3]    \n"
4098 
4099                         "prfm   pldl1keep, [%3, #512]       \n"
4100                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23
4101 
4102                         "fmla   v20.4s, v24.4s, v10.s[0]    \n"
4103                         "fmla   v21.4s, v24.4s, v12.s[0]    \n"
4104                         "fmla   v22.4s, v24.4s, v14.s[0]    \n"
4105                         "fmla   v23.4s, v24.4s, v28.s[0]    \n"
4106                         "fmla   v20.4s, v25.4s, v10.s[1]    \n"
4107                         "fmla   v21.4s, v25.4s, v12.s[1]    \n"
4108                         "fmla   v22.4s, v25.4s, v14.s[1]    \n"
4109                         "fmla   v23.4s, v25.4s, v28.s[1]    \n"
4110 
4111                         "prfm   pldl1keep, [%4, #512]       \n"
4112                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4113 
4114                         "fmla   v20.4s, v26.4s, v10.s[2]    \n"
4115                         "fmla   v21.4s, v26.4s, v12.s[2]    \n"
4116                         "fmla   v22.4s, v26.4s, v14.s[2]    \n"
4117                         "fmla   v23.4s, v26.4s, v28.s[2]    \n"
4118                         "fmla   v20.4s, v27.4s, v10.s[3]    \n"
4119                         "fmla   v21.4s, v27.4s, v12.s[3]    \n"
4120                         "fmla   v22.4s, v27.4s, v14.s[3]    \n"
4121                         "fmla   v23.4s, v27.4s, v28.s[3]    \n"
4122 
4123                         "prfm   pldl1keep, [%3, #512]       \n"
4124                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r24 r25 r26 r27
4125 
4126                         "fmla   v20.4s, v16.4s, v0.s[0]     \n"
4127                         "fmla   v21.4s, v16.4s, v2.s[0]     \n"
4128                         "fmla   v22.4s, v16.4s, v4.s[0]     \n"
4129                         "fmla   v23.4s, v16.4s, v6.s[0]     \n"
4130                         "fmla   v20.4s, v17.4s, v0.s[1]     \n"
4131                         "fmla   v21.4s, v17.4s, v2.s[1]     \n"
4132                         "fmla   v22.4s, v17.4s, v4.s[1]     \n"
4133                         "fmla   v23.4s, v17.4s, v6.s[1]     \n"
4134 
4135                         "prfm   pldl1keep, [%4, #512]       \n"
4136                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4137 
4138                         "fmla   v20.4s, v18.4s, v0.s[2]     \n"
4139                         "fmla   v21.4s, v18.4s, v2.s[2]     \n"
4140                         "fmla   v22.4s, v18.4s, v4.s[2]     \n"
4141                         "fmla   v23.4s, v18.4s, v6.s[2]     \n"
4142                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
4143                         "fmla   v21.4s, v19.4s, v2.s[3]     \n"
4144                         "fmla   v22.4s, v19.4s, v4.s[3]     \n"
4145                         "fmla   v23.4s, v19.4s, v6.s[3]     \n"
4146 
4147                         "prfm   pldl1keep, [%3, #128]       \n"
4148                         "ld1    {v28.4s}, [%3]              \n" // r28
4149 
4150                         "fmla   v20.4s, v24.4s, v1.s[0]     \n"
4151                         "fmla   v21.4s, v24.4s, v3.s[0]     \n"
4152                         "fmla   v22.4s, v24.4s, v5.s[0]     \n"
4153                         "fmla   v23.4s, v24.4s, v7.s[0]     \n"
4154                         "fmla   v20.4s, v25.4s, v1.s[1]     \n"
4155                         "fmla   v21.4s, v25.4s, v3.s[1]     \n"
4156                         "fmla   v22.4s, v25.4s, v5.s[1]     \n"
4157                         "fmla   v23.4s, v25.4s, v7.s[1]     \n"
4158 
4159                         //                         "prfm   pldl1keep, [%4, #512]       \n"
4160                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4161 
4162                         "fmla   v20.4s, v26.4s, v1.s[2]     \n"
4163                         "fmla   v21.4s, v26.4s, v3.s[2]     \n"
4164                         "fmla   v22.4s, v26.4s, v5.s[2]     \n"
4165                         "fmla   v23.4s, v26.4s, v7.s[2]     \n"
4166                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4167                         "fmla   v21.4s, v27.4s, v3.s[3]     \n"
4168                         "fmla   v22.4s, v27.4s, v5.s[3]     \n"
4169                         "fmla   v23.4s, v27.4s, v7.s[3]     \n"
4170 
4171                         "fmla   v20.4s, v16.4s, v2.s[0]     \n"
4172                         "fmla   v21.4s, v16.4s, v4.s[0]     \n"
4173                         "fmla   v22.4s, v16.4s, v6.s[0]     \n"
4174                         "fmla   v23.4s, v16.4s, v28.s[0]    \n"
4175                         "fmla   v20.4s, v17.4s, v2.s[1]     \n"
4176                         "fmla   v21.4s, v17.4s, v4.s[1]     \n"
4177                         "fmla   v22.4s, v17.4s, v6.s[1]     \n"
4178                         "fmla   v23.4s, v17.4s, v28.s[1]    \n"
4179                         "fmla   v20.4s, v18.4s, v2.s[2]     \n"
4180                         "fmla   v21.4s, v18.4s, v4.s[2]     \n"
4181                         "fmla   v22.4s, v18.4s, v6.s[2]     \n"
4182                         "fmla   v23.4s, v18.4s, v28.s[2]    \n"
4183                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4184                         "fmla   v21.4s, v19.4s, v4.s[3]     \n"
4185                         "fmla   v22.4s, v19.4s, v6.s[3]     \n"
4186                         "fmla   v23.4s, v19.4s, v28.s[3]    \n"
4187 
4188                         "sub    %4, %4, #512                \n" // kptr -= 8 * 16;
4189 
4190                         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
4191 
4192                         : "=r"(outptr0), // %0
4193                         "=r"(r0),      // %1
4194                         "=r"(r1),      // %2
4195                         "=r"(r2),      // %3
4196                         "=r"(kptr)     // %4
4197                         : "0"(outptr0),
4198                         "1"(r0),
4199                         "2"(r1),
4200                         "3"(r2),
4201                         "4"(kptr)
4202                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
4203 #else  // __aarch64__
4204                     asm volatile(
4205                         "pld        [%0, #512]          \n"
4206                         "vldm       %0, {d24-d31}       \n" // sum0 sum1 sum2 sum3
4207 
4208                         "pld        [%1, #512]          \n"
4209                         "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03
4210 
4211                         "pld        [%1, #512]          \n"
4212                         "vldm       %1!, {d8-d15}       \n" // r04 r05 r06 r07
4213 
4214                         "pld        [%4, #512]          \n"
4215                         "vldm       %4!, {d16-d23}      \n"
4216 
4217                         "vmla.f32   q12, q8, d0[0]      \n"
4218                         "vmla.f32   q13, q8, d4[0]      \n"
4219                         "vmla.f32   q14, q8, d8[0]      \n"
4220                         "vmla.f32   q15, q8, d12[0]     \n"
4221                         "vmla.f32   q12, q9, d0[1]      \n"
4222                         "vmla.f32   q13, q9, d4[1]      \n"
4223                         "vmla.f32   q14, q9, d8[1]      \n"
4224                         "vmla.f32   q15, q9, d12[1]     \n"
4225                         "vmla.f32   q12, q10, d1[0]     \n"
4226                         "vmla.f32   q13, q10, d5[0]     \n"
4227                         "vmla.f32   q14, q10, d9[0]     \n"
4228                         "vmla.f32   q15, q10, d13[0]    \n"
4229                         "vmla.f32   q12, q11, d1[1]     \n"
4230                         "vmla.f32   q13, q11, d5[1]     \n"
4231                         "vmla.f32   q14, q11, d9[1]     \n"
4232                         "vmla.f32   q15, q11, d13[1]    \n"
4233 
4234                         "pld        [%4, #512]          \n"
4235                         "vldm       %4!, {d16-d23}      \n"
4236 
4237                         "pld        [%1, #128]          \n"
4238                         "vld1.f32   {d0-d1}, [%1 :128]  \n" // r08
4239 
4240                         "vmla.f32   q12, q8, d2[0]      \n"
4241                         "vmla.f32   q13, q8, d6[0]      \n"
4242                         "vmla.f32   q14, q8, d10[0]     \n"
4243                         "vmla.f32   q15, q8, d14[0]     \n"
4244                         "vmla.f32   q12, q9, d2[1]      \n"
4245                         "vmla.f32   q13, q9, d6[1]      \n"
4246                         "vmla.f32   q14, q9, d10[1]     \n"
4247                         "vmla.f32   q15, q9, d14[1]     \n"
4248                         "vmla.f32   q12, q10, d3[0]     \n"
4249                         "vmla.f32   q13, q10, d7[0]     \n"
4250                         "vmla.f32   q14, q10, d11[0]    \n"
4251                         "vmla.f32   q15, q10, d15[0]    \n"
4252                         "vmla.f32   q12, q11, d3[1]     \n"
4253                         "vmla.f32   q13, q11, d7[1]     \n"
4254                         "vmla.f32   q14, q11, d11[1]    \n"
4255                         "vmla.f32   q15, q11, d15[1]    \n"
4256 
4257                         "pld        [%4, #512]          \n"
4258                         "vldm       %4!, {d16-d23}      \n"
4259 
4260                         "vmla.f32   q12, q8, d4[0]      \n"
4261                         "vmla.f32   q13, q8, d8[0]      \n"
4262                         "vmla.f32   q14, q8, d12[0]     \n"
4263                         "vmla.f32   q15, q8, d0[0]      \n"
4264                         "vmla.f32   q12, q9, d4[1]      \n"
4265                         "vmla.f32   q13, q9, d8[1]      \n"
4266                         "vmla.f32   q14, q9, d12[1]     \n"
4267                         "vmla.f32   q15, q9, d0[1]      \n"
4268                         "vmla.f32   q12, q10, d5[0]     \n"
4269                         "vmla.f32   q13, q10, d9[0]     \n"
4270                         "vmla.f32   q14, q10, d13[0]    \n"
4271                         "vmla.f32   q15, q10, d1[0]     \n"
4272                         "vmla.f32   q12, q11, d5[1]     \n"
4273                         "vmla.f32   q13, q11, d9[1]     \n"
4274                         "vmla.f32   q14, q11, d13[1]    \n"
4275                         "vmla.f32   q15, q11, d1[1]     \n"
4276 
4277                         "pld        [%2, #512]          \n"
4278                         "vldm       %2!, {d8-d15}       \n" // r10 r11 r12 r13
4279 
4280                         "pld        [%2, #512]          \n"
4281                         "vldm       %2!, {d0-d7}        \n" // r14 r15 r16 r17
4282 
4283                         "pld        [%4, #512]          \n"
4284                         "vldm       %4!, {d16-d23}      \n"
4285 
4286                         "vmla.f32   q12, q8, d8[0]      \n"
4287                         "vmla.f32   q13, q8, d12[0]     \n"
4288                         "vmla.f32   q14, q8, d0[0]      \n"
4289                         "vmla.f32   q15, q8, d4[0]      \n"
4290                         "vmla.f32   q12, q9, d8[1]      \n"
4291                         "vmla.f32   q13, q9, d12[1]     \n"
4292                         "vmla.f32   q14, q9, d0[1]      \n"
4293                         "vmla.f32   q15, q9, d4[1]      \n"
4294                         "vmla.f32   q12, q10, d9[0]     \n"
4295                         "vmla.f32   q13, q10, d13[0]    \n"
4296                         "vmla.f32   q14, q10, d1[0]     \n"
4297                         "vmla.f32   q15, q10, d5[0]     \n"
4298                         "vmla.f32   q12, q11, d9[1]     \n"
4299                         "vmla.f32   q13, q11, d13[1]    \n"
4300                         "vmla.f32   q14, q11, d1[1]     \n"
4301                         "vmla.f32   q15, q11, d5[1]     \n"
4302 
4303                         "pld        [%4, #512]          \n"
4304                         "vldm       %4!, {d16-d23}      \n"
4305 
4306                         "pld        [%2, #128]          \n"
4307                         "vld1.f32   {d8-d9}, [%2 :128]  \n" // r18
4308 
4309                         "vmla.f32   q12, q8, d10[0]     \n"
4310                         "vmla.f32   q13, q8, d14[0]     \n"
4311                         "vmla.f32   q14, q8, d2[0]      \n"
4312                         "vmla.f32   q15, q8, d6[0]      \n"
4313                         "vmla.f32   q12, q9, d10[1]     \n"
4314                         "vmla.f32   q13, q9, d14[1]     \n"
4315                         "vmla.f32   q14, q9, d2[1]      \n"
4316                         "vmla.f32   q15, q9, d6[1]      \n"
4317                         "vmla.f32   q12, q10, d11[0]    \n"
4318                         "vmla.f32   q13, q10, d15[0]    \n"
4319                         "vmla.f32   q14, q10, d3[0]     \n"
4320                         "vmla.f32   q15, q10, d7[0]     \n"
4321                         "vmla.f32   q12, q11, d11[1]    \n"
4322                         "vmla.f32   q13, q11, d15[1]    \n"
4323                         "vmla.f32   q14, q11, d3[1]     \n"
4324                         "vmla.f32   q15, q11, d7[1]     \n"
4325 
4326                         "pld        [%4, #512]          \n"
4327                         "vldm       %4!, {d16-d23}      \n"
4328 
4329                         "vmla.f32   q12, q8, d12[0]     \n"
4330                         "vmla.f32   q13, q8, d0[0]      \n"
4331                         "vmla.f32   q14, q8, d4[0]      \n"
4332                         "vmla.f32   q15, q8, d8[0]      \n"
4333                         "vmla.f32   q12, q9, d12[1]     \n"
4334                         "vmla.f32   q13, q9, d0[1]      \n"
4335                         "vmla.f32   q14, q9, d4[1]      \n"
4336                         "vmla.f32   q15, q9, d8[1]      \n"
4337                         "vmla.f32   q12, q10, d13[0]    \n"
4338                         "vmla.f32   q13, q10, d1[0]     \n"
4339                         "vmla.f32   q14, q10, d5[0]     \n"
4340                         "vmla.f32   q15, q10, d9[0]     \n"
4341                         "vmla.f32   q12, q11, d13[1]    \n"
4342                         "vmla.f32   q13, q11, d1[1]     \n"
4343                         "vmla.f32   q14, q11, d5[1]     \n"
4344                         "vmla.f32   q15, q11, d9[1]     \n"
4345 
4346                         "pld        [%3, #512]          \n"
4347                         "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23
4348 
4349                         "pld        [%3, #512]          \n"
4350                         "vldm       %3!, {d8-d15}       \n" // r24 r25 r26 r27
4351 
4352                         "pld        [%4, #512]          \n"
4353                         "vldm       %4!, {d16-d23}      \n"
4354 
4355                         "vmla.f32   q12, q8, d0[0]      \n"
4356                         "vmla.f32   q13, q8, d4[0]      \n"
4357                         "vmla.f32   q14, q8, d8[0]      \n"
4358                         "vmla.f32   q15, q8, d12[0]     \n"
4359                         "vmla.f32   q12, q9, d0[1]      \n"
4360                         "vmla.f32   q13, q9, d4[1]      \n"
4361                         "vmla.f32   q14, q9, d8[1]      \n"
4362                         "vmla.f32   q15, q9, d12[1]     \n"
4363                         "vmla.f32   q12, q10, d1[0]     \n"
4364                         "vmla.f32   q13, q10, d5[0]     \n"
4365                         "vmla.f32   q14, q10, d9[0]     \n"
4366                         "vmla.f32   q15, q10, d13[0]    \n"
4367                         "vmla.f32   q12, q11, d1[1]     \n"
4368                         "vmla.f32   q13, q11, d5[1]     \n"
4369                         "vmla.f32   q14, q11, d9[1]     \n"
4370                         "vmla.f32   q15, q11, d13[1]    \n"
4371 
4372                         "pld        [%4, #512]          \n"
4373                         "vldm       %4!, {d16-d23}      \n"
4374 
4375                         "pld        [%3, #128]          \n"
4376                         "vld1.f32   {d0-d1}, [%3 :128]  \n" // r28
4377 
4378                         "vmla.f32   q12, q8, d2[0]      \n"
4379                         "vmla.f32   q13, q8, d6[0]      \n"
4380                         "vmla.f32   q14, q8, d10[0]     \n"
4381                         "vmla.f32   q15, q8, d14[0]     \n"
4382                         "vmla.f32   q12, q9, d2[1]      \n"
4383                         "vmla.f32   q13, q9, d6[1]      \n"
4384                         "vmla.f32   q14, q9, d10[1]     \n"
4385                         "vmla.f32   q15, q9, d14[1]     \n"
4386                         "vmla.f32   q12, q10, d3[0]     \n"
4387                         "vmla.f32   q13, q10, d7[0]     \n"
4388                         "vmla.f32   q14, q10, d11[0]    \n"
4389                         "vmla.f32   q15, q10, d15[0]    \n"
4390                         "vmla.f32   q12, q11, d3[1]     \n"
4391                         "vmla.f32   q13, q11, d7[1]     \n"
4392                         "vmla.f32   q14, q11, d11[1]    \n"
4393                         "vmla.f32   q15, q11, d15[1]    \n"
4394 
4395                         //                         "pld        [%4, #512]          \n"
4396                         "vldm       %4, {d16-d23}       \n"
4397 
4398                         "vmla.f32   q12, q8, d4[0]      \n"
4399                         "vmla.f32   q13, q8, d8[0]      \n"
4400                         "vmla.f32   q14, q8, d12[0]     \n"
4401                         "vmla.f32   q15, q8, d0[0]      \n"
4402                         "vmla.f32   q12, q9, d4[1]      \n"
4403                         "vmla.f32   q13, q9, d8[1]      \n"
4404                         "vmla.f32   q14, q9, d12[1]     \n"
4405                         "vmla.f32   q15, q9, d0[1]      \n"
4406                         "vmla.f32   q12, q10, d5[0]     \n"
4407                         "vmla.f32   q13, q10, d9[0]     \n"
4408                         "vmla.f32   q14, q10, d13[0]    \n"
4409                         "vmla.f32   q15, q10, d1[0]     \n"
4410                         "vmla.f32   q12, q11, d5[1]     \n"
4411                         "vmla.f32   q13, q11, d9[1]     \n"
4412                         "vmla.f32   q14, q11, d13[1]    \n"
4413                         "vmla.f32   q15, q11, d1[1]     \n"
4414 
4415                         "sub        %4, %4, #512        \n" // kptr -= 8 * 16;
4416 
4417                         "vstm       %0!, {d24-d31}      \n"
4418 
4419                         : "=r"(outptr0), // %0
4420                         "=r"(r0),      // %1
4421                         "=r"(r1),      // %2
4422                         "=r"(r2),      // %3
4423                         "=r"(kptr)     // %4
4424                         : "0"(outptr0),
4425                         "1"(r0),
4426                         "2"(r1),
4427                         "3"(r2),
4428                         "4"(kptr)
4429                         : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4430 #endif // __aarch64__
4431                 }
4432                 for (; j + 1 < outw; j += 2)
4433                 {
4434 #if __aarch64__
4435                     asm volatile(
4436                         "prfm   pldl1keep, [%0, #256]       \n"
4437                         "ld1    {v20.4s, v21.4s}, [%0]      \n" // sum0 sum1
4438 
4439                         "prfm   pldl1keep, [%1, #512]       \n"
4440                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03
4441 
4442                         "prfm   pldl1keep, [%4, #512]       \n"
4443                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4444 
4445                         "fmul   v22.4s, v16.4s, v0.s[0]     \n"
4446                         "fmul   v23.4s, v16.4s, v2.s[0]     \n"
4447                         "fmla   v20.4s, v17.4s, v0.s[1]     \n"
4448                         "fmla   v21.4s, v17.4s, v2.s[1]     \n"
4449 
4450                         "prfm   pldl1keep, [%4, #512]       \n"
4451                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4452 
4453                         "fmla   v22.4s, v18.4s, v0.s[2]     \n"
4454                         "fmla   v23.4s, v18.4s, v2.s[2]     \n"
4455                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
4456                         "fmla   v21.4s, v19.4s, v2.s[3]     \n"
4457 
4458                         "prfm   pldl1keep, [%1, #128]       \n"
4459                         "ld1    {v4.4s}, [%1]               \n" // r04
4460 
4461                         "fmla   v22.4s, v24.4s, v1.s[0]     \n"
4462                         "fmla   v23.4s, v24.4s, v3.s[0]     \n"
4463                         "fmla   v20.4s, v25.4s, v1.s[1]     \n"
4464                         "fmla   v21.4s, v25.4s, v3.s[1]     \n"
4465 
4466                         "prfm   pldl1keep, [%4, #512]       \n"
4467                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4468 
4469                         "fmla   v22.4s, v26.4s, v1.s[2]     \n"
4470                         "fmla   v23.4s, v26.4s, v3.s[2]     \n"
4471                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4472                         "fmla   v21.4s, v27.4s, v3.s[3]     \n"
4473 
4474                         "fmla   v22.4s, v16.4s, v2.s[0]     \n"
4475                         "fmla   v23.4s, v16.4s, v4.s[0]     \n"
4476                         "fmla   v20.4s, v17.4s, v2.s[1]     \n"
4477                         "fmla   v21.4s, v17.4s, v4.s[1]     \n"
4478 
4479                         "prfm   pldl1keep, [%4, #512]       \n"
4480                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4481 
4482                         "fmla   v22.4s, v18.4s, v2.s[2]     \n"
4483                         "fmla   v23.4s, v18.4s, v4.s[2]     \n"
4484                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4485                         "fmla   v21.4s, v19.4s, v4.s[3]     \n"
4486 
4487                         "prfm   pldl1keep, [%2, #512]       \n"
4488                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13
4489 
4490                         "fmla   v22.4s, v24.4s, v0.s[0]     \n"
4491                         "fmla   v23.4s, v24.4s, v2.s[0]     \n"
4492                         "fmla   v20.4s, v25.4s, v0.s[1]     \n"
4493                         "fmla   v21.4s, v25.4s, v2.s[1]     \n"
4494 
4495                         "prfm   pldl1keep, [%4, #512]       \n"
4496                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4497 
4498                         "fmla   v22.4s, v26.4s, v0.s[2]     \n"
4499                         "fmla   v23.4s, v26.4s, v2.s[2]     \n"
4500                         "fmla   v20.4s, v27.4s, v0.s[3]     \n"
4501                         "fmla   v21.4s, v27.4s, v2.s[3]     \n"
4502 
4503                         "prfm   pldl1keep, [%2, #128]       \n"
4504                         "ld1    {v4.4s}, [%2]               \n" // r14
4505 
4506                         "fmla   v22.4s, v16.4s, v1.s[0]     \n"
4507                         "fmla   v23.4s, v16.4s, v3.s[0]     \n"
4508                         "fmla   v20.4s, v17.4s, v1.s[1]     \n"
4509                         "fmla   v21.4s, v17.4s, v3.s[1]     \n"
4510 
4511                         "prfm   pldl1keep, [%4, #512]       \n"
4512                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4513 
4514                         "fmla   v22.4s, v18.4s, v1.s[2]     \n"
4515                         "fmla   v23.4s, v18.4s, v3.s[2]     \n"
4516                         "fmla   v20.4s, v19.4s, v1.s[3]     \n"
4517                         "fmla   v21.4s, v19.4s, v3.s[3]     \n"
4518 
4519                         "fmla   v22.4s, v24.4s, v2.s[0]     \n"
4520                         "fmla   v23.4s, v24.4s, v4.s[0]     \n"
4521                         "fmla   v20.4s, v25.4s, v2.s[1]     \n"
4522                         "fmla   v21.4s, v25.4s, v4.s[1]     \n"
4523 
4524                         "prfm   pldl1keep, [%4, #512]       \n"
4525                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4526 
4527                         "fmla   v22.4s, v26.4s, v2.s[2]     \n"
4528                         "fmla   v23.4s, v26.4s, v4.s[2]     \n"
4529                         "fmla   v20.4s, v27.4s, v2.s[3]     \n"
4530                         "fmla   v21.4s, v27.4s, v4.s[3]     \n"
4531 
4532                         "prfm   pldl1keep, [%3, #512]       \n"
4533                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23
4534 
4535                         "fmla   v22.4s, v16.4s, v0.s[0]     \n"
4536                         "fmla   v23.4s, v16.4s, v2.s[0]     \n"
4537                         "fmla   v20.4s, v17.4s, v0.s[1]     \n"
4538                         "fmla   v21.4s, v17.4s, v2.s[1]     \n"
4539 
4540                         "prfm   pldl1keep, [%4, #512]       \n"
4541                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4542 
4543                         "fmla   v22.4s, v18.4s, v0.s[2]     \n"
4544                         "fmla   v23.4s, v18.4s, v2.s[2]     \n"
4545                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
4546                         "fmla   v21.4s, v19.4s, v2.s[3]     \n"
4547 
4548                         "prfm   pldl1keep, [%3, #128]       \n"
4549                         "ld1    {v4.4s}, [%3]               \n" // r24
4550 
4551                         "fmla   v22.4s, v24.4s, v1.s[0]     \n"
4552                         "fmla   v23.4s, v24.4s, v3.s[0]     \n"
4553                         "fmla   v20.4s, v25.4s, v1.s[1]     \n"
4554                         "fmla   v21.4s, v25.4s, v3.s[1]     \n"
4555 
4556                         //                         "prfm   pldl1keep, [%4, #512]       \n"
4557                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4558 
4559                         "fmla   v22.4s, v26.4s, v1.s[2]     \n"
4560                         "fmla   v23.4s, v26.4s, v3.s[2]     \n"
4561                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4562                         "fmla   v21.4s, v27.4s, v3.s[3]     \n"
4563 
4564                         "fmla   v22.4s, v16.4s, v2.s[0]     \n"
4565                         "fmla   v23.4s, v16.4s, v4.s[0]     \n"
4566                         "fmla   v20.4s, v17.4s, v2.s[1]     \n"
4567                         "fmla   v21.4s, v17.4s, v4.s[1]     \n"
4568                         "fmla   v22.4s, v18.4s, v2.s[2]     \n"
4569                         "fmla   v23.4s, v18.4s, v4.s[2]     \n"
4570                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4571                         "fmla   v21.4s, v19.4s, v4.s[3]     \n"
4572 
4573                         "fadd   v20.4s, v20.4s, v22.4s      \n"
4574                         "fadd   v21.4s, v21.4s, v23.4s      \n"
4575 
4576                         "sub    %4, %4, #512                \n" // kptr -= 8 * 16;
4577 
4578                         "st1    {v20.4s, v21.4s}, [%0], #32 \n"
4579 
4580                         : "=r"(outptr0), // %0
4581                         "=r"(r0),      // %1
4582                         "=r"(r1),      // %2
4583                         "=r"(r2),      // %3
4584                         "=r"(kptr)     // %4
4585                         : "0"(outptr0),
4586                         "1"(r0),
4587                         "2"(r1),
4588                         "3"(r2),
4589                         "4"(kptr)
4590                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
4591 #else  // __aarch64__
4592                     asm volatile(
4593                         "pld        [%0, #256]          \n"
4594                         "vld1.f32   {d24-d27}, [%0 :128] \n" // sum0 sum1
4595 
4596                         "pld        [%1, #512]          \n"
4597                         "vldm       %1!, {d0-d7}        \n" // r00 r01 r02 r03
4598 
4599                         "pld        [%4, #512]          \n"
4600                         "vldm       %4!, {d16-d23}      \n"
4601 
4602                         "vmul.f32   q14, q8, d0[0]      \n"
4603                         "vmul.f32   q15, q8, d4[0]      \n"
4604                         "vmla.f32   q12, q9, d0[1]      \n"
4605                         "vmla.f32   q13, q9, d4[1]      \n"
4606                         "vmla.f32   q14, q10, d1[0]     \n"
4607                         "vmla.f32   q15, q10, d5[0]     \n"
4608                         "vmla.f32   q12, q11, d1[1]     \n"
4609                         "vmla.f32   q13, q11, d5[1]     \n"
4610 
4611                         "pld        [%4, #512]          \n"
4612                         "vldm       %4!, {d16-d23}      \n"
4613 
4614                         "pld        [%1, #128]          \n"
4615                         "vld1.f32   {d8-d9}, [%1 :128]  \n" // r04
4616 
4617                         "vmla.f32   q14, q8, d2[0]      \n"
4618                         "vmla.f32   q15, q8, d6[0]      \n"
4619                         "vmla.f32   q12, q9, d2[1]      \n"
4620                         "vmla.f32   q13, q9, d6[1]      \n"
4621                         "vmla.f32   q14, q10, d3[0]     \n"
4622                         "vmla.f32   q15, q10, d7[0]     \n"
4623                         "vmla.f32   q12, q11, d3[1]     \n"
4624                         "vmla.f32   q13, q11, d7[1]     \n"
4625 
4626                         "pld        [%4, #512]          \n"
4627                         "vldm       %4!, {d16-d23}      \n"
4628 
4629                         "vmla.f32   q14, q8, d4[0]      \n"
4630                         "vmla.f32   q15, q8, d8[0]      \n"
4631                         "vmla.f32   q12, q9, d4[1]      \n"
4632                         "vmla.f32   q13, q9, d8[1]      \n"
4633                         "vmla.f32   q14, q10, d5[0]     \n"
4634                         "vmla.f32   q15, q10, d9[0]     \n"
4635                         "vmla.f32   q12, q11, d5[1]     \n"
4636                         "vmla.f32   q13, q11, d9[1]     \n"
4637 
4638                         "pld        [%2, #512]          \n"
4639                         "vldm       %2!, {d0-d7}        \n" // r10 r11 r12 r13
4640 
4641                         "pld        [%4, #512]          \n"
4642                         "vldm       %4!, {d16-d23}      \n"
4643 
4644                         "vmla.f32   q14, q8, d0[0]      \n"
4645                         "vmla.f32   q15, q8, d4[0]      \n"
4646                         "vmla.f32   q12, q9, d0[1]      \n"
4647                         "vmla.f32   q13, q9, d4[1]      \n"
4648                         "vmla.f32   q14, q10, d1[0]     \n"
4649                         "vmla.f32   q15, q10, d5[0]     \n"
4650                         "vmla.f32   q12, q11, d1[1]     \n"
4651                         "vmla.f32   q13, q11, d5[1]     \n"
4652 
4653                         "pld        [%4, #512]          \n"
4654                         "vldm       %4!, {d16-d23}      \n"
4655 
4656                         "pld        [%2, #128]          \n"
4657                         "vld1.f32   {d8-d9}, [%2 :128]  \n" // r14
4658 
4659                         "vmla.f32   q14, q8, d2[0]      \n"
4660                         "vmla.f32   q15, q8, d6[0]      \n"
4661                         "vmla.f32   q12, q9, d2[1]      \n"
4662                         "vmla.f32   q13, q9, d6[1]      \n"
4663                         "vmla.f32   q14, q10, d3[0]     \n"
4664                         "vmla.f32   q15, q10, d7[0]     \n"
4665                         "vmla.f32   q12, q11, d3[1]     \n"
4666                         "vmla.f32   q13, q11, d7[1]     \n"
4667 
4668                         "pld        [%4, #512]          \n"
4669                         "vldm       %4!, {d16-d23}      \n"
4670 
4671                         "vmla.f32   q14, q8, d4[0]      \n"
4672                         "vmla.f32   q15, q8, d8[0]      \n"
4673                         "vmla.f32   q12, q9, d4[1]      \n"
4674                         "vmla.f32   q13, q9, d8[1]      \n"
4675                         "vmla.f32   q14, q10, d5[0]     \n"
4676                         "vmla.f32   q15, q10, d9[0]     \n"
4677                         "vmla.f32   q12, q11, d5[1]     \n"
4678                         "vmla.f32   q13, q11, d9[1]     \n"
4679 
4680                         "pld        [%3, #512]          \n"
4681                         "vldm       %3!, {d0-d7}        \n" // r20 r21 r22 r23
4682 
4683                         "pld        [%4, #512]          \n"
4684                         "vldm       %4!, {d16-d23}      \n"
4685 
4686                         "vmla.f32   q14, q8, d0[0]      \n"
4687                         "vmla.f32   q15, q8, d4[0]      \n"
4688                         "vmla.f32   q12, q9, d0[1]      \n"
4689                         "vmla.f32   q13, q9, d4[1]      \n"
4690                         "vmla.f32   q14, q10, d1[0]     \n"
4691                         "vmla.f32   q15, q10, d5[0]     \n"
4692                         "vmla.f32   q12, q11, d1[1]     \n"
4693                         "vmla.f32   q13, q11, d5[1]     \n"
4694 
4695                         "pld        [%4, #512]          \n"
4696                         "vldm       %4!, {d16-d23}      \n"
4697 
4698                         "pld        [%3, #128]          \n"
4699                         "vld1.f32   {d8-d9}, [%3 :128]  \n" // r24
4700 
4701                         "vmla.f32   q14, q8, d2[0]      \n"
4702                         "vmla.f32   q15, q8, d6[0]      \n"
4703                         "vmla.f32   q12, q9, d2[1]      \n"
4704                         "vmla.f32   q13, q9, d6[1]      \n"
4705                         "vmla.f32   q14, q10, d3[0]     \n"
4706                         "vmla.f32   q15, q10, d7[0]     \n"
4707                         "vmla.f32   q12, q11, d3[1]     \n"
4708                         "vmla.f32   q13, q11, d7[1]     \n"
4709 
4710                         //                         "pld        [%4, #512]          \n"
4711                         "vldm       %4, {d16-d23}       \n"
4712 
4713                         "vmla.f32   q14, q8, d4[0]      \n"
4714                         "vmla.f32   q15, q8, d8[0]      \n"
4715                         "vmla.f32   q12, q9, d4[1]      \n"
4716                         "vmla.f32   q13, q9, d8[1]      \n"
4717                         "vmla.f32   q14, q10, d5[0]     \n"
4718                         "vmla.f32   q15, q10, d9[0]     \n"
4719                         "vmla.f32   q12, q11, d5[1]     \n"
4720                         "vmla.f32   q13, q11, d9[1]     \n"
4721 
4722                         "vadd.f32   q12, q12, q14       \n"
4723                         "vadd.f32   q13, q13, q15       \n"
4724 
4725                         "sub        %4, %4, #512        \n" // kptr -= 8 * 16;
4726 
4727                         "vst1.f32   {d24-d27}, [%0 :128]! \n"
4728 
4729                         : "=r"(outptr0), // %0
4730                         "=r"(r0),      // %1
4731                         "=r"(r1),      // %2
4732                         "=r"(r2),      // %3
4733                         "=r"(kptr)     // %4
4734                         : "0"(outptr0),
4735                         "1"(r0),
4736                         "2"(r1),
4737                         "3"(r2),
4738                         "4"(kptr)
4739                         : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4740 #endif // __aarch64__
4741                 }
4742                 for (; j < outw; j++)
4743                 {
4744 #if __aarch64__
4745                     asm volatile(
4746                         "prfm   pldl1keep, [%0, #128]       \n"
4747                         "ld1    {v20.4s}, [%0]              \n" // sum0
4748 
4749                         "prfm   pldl1keep, [%1, #384]       \n"
4750                         "ld1    {v0.4s, v1.4s, v2.4s}, [%1] \n" // r00 r01 r02
4751 
4752                         "prfm   pldl1keep, [%4, #512]       \n"
4753                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4754 
4755                         "fmul   v21.4s, v16.4s, v0.s[0]     \n"
4756                         "fmul   v22.4s, v17.4s, v0.s[1]     \n"
4757 
4758                         "prfm   pldl1keep, [%4, #512]       \n"
4759                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4760 
4761                         "fmul   v23.4s, v18.4s, v0.s[2]     \n"
4762                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
4763 
4764                         "fmla   v21.4s, v24.4s, v1.s[0]     \n"
4765                         "fmla   v22.4s, v25.4s, v1.s[1]     \n"
4766 
4767                         "prfm   pldl1keep, [%4, #512]       \n"
4768                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4769 
4770                         "fmla   v23.4s, v26.4s, v1.s[2]     \n"
4771                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4772 
4773                         "prfm   pldl1keep, [%2, #384]       \n"
4774                         "ld1    {v3.4s, v4.4s, v5.4s}, [%2] \n" // r10 r11 r12
4775 
4776                         "fmla   v21.4s, v16.4s, v2.s[0]     \n"
4777                         "fmla   v22.4s, v17.4s, v2.s[1]     \n"
4778 
4779                         "prfm   pldl1keep, [%4, #512]       \n"
4780                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4781 
4782                         "fmla   v23.4s, v18.4s, v2.s[2]     \n"
4783                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4784 
4785                         "fmla   v21.4s, v24.4s, v3.s[0]     \n"
4786                         "fmla   v22.4s, v25.4s, v3.s[1]     \n"
4787 
4788                         "prfm   pldl1keep, [%4, #512]       \n"
4789                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4790 
4791                         "fmla   v23.4s, v26.4s, v3.s[2]     \n"
4792                         "fmla   v20.4s, v27.4s, v3.s[3]     \n"
4793 
4794                         "fmla   v21.4s, v16.4s, v4.s[0]     \n"
4795                         "fmla   v22.4s, v17.4s, v4.s[1]     \n"
4796 
4797                         "prfm   pldl1keep, [%4, #512]       \n"
4798                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4799 
4800                         "fmla   v23.4s, v18.4s, v4.s[2]     \n"
4801                         "fmla   v20.4s, v19.4s, v4.s[3]     \n"
4802 
4803                         "prfm   pldl1keep, [%3, #384]       \n"
4804                         "ld1    {v0.4s, v1.4s, v2.4s}, [%3] \n" // r20 r21 r22
4805 
4806                         "fmla   v21.4s, v24.4s, v5.s[0]     \n"
4807                         "fmla   v22.4s, v25.4s, v5.s[1]     \n"
4808 
4809                         "prfm   pldl1keep, [%4, #512]       \n"
4810                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4811 
4812                         "fmla   v23.4s, v26.4s, v5.s[2]     \n"
4813                         "fmla   v20.4s, v27.4s, v5.s[3]     \n"
4814 
4815                         "fmla   v21.4s, v16.4s, v0.s[0]     \n"
4816                         "fmla   v22.4s, v17.4s, v0.s[1]     \n"
4817 
4818                         "prfm   pldl1keep, [%4, #512]       \n"
4819                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4820 
4821                         "fmla   v23.4s, v18.4s, v0.s[2]     \n"
4822                         "fmla   v20.4s, v19.4s, v0.s[3]     \n"
4823 
4824                         "fmla   v21.4s, v24.4s, v1.s[0]     \n"
4825                         "fmla   v22.4s, v25.4s, v1.s[1]     \n"
4826 
4827                         //                         "prfm   pldl1keep, [%4, #512]       \n"
4828                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4829 
4830                         "fmla   v23.4s, v26.4s, v1.s[2]     \n"
4831                         "fmla   v20.4s, v27.4s, v1.s[3]     \n"
4832 
4833                         "fmla   v21.4s, v16.4s, v2.s[0]     \n"
4834                         "fmla   v22.4s, v17.4s, v2.s[1]     \n"
4835                         "fmla   v23.4s, v18.4s, v2.s[2]     \n"
4836                         "fmla   v20.4s, v19.4s, v2.s[3]     \n"
4837 
4838                         "add    %1, %1, #32                 \n"
4839 
4840                         "fadd   v22.4s, v21.4s, v22.4s      \n"
4841 
4842                         "add    %2, %2, #32                 \n"
4843 
4844                         "fadd   v23.4s, v23.4s, v22.4s      \n"
4845 
4846                         "add    %3, %3, #32                 \n"
4847 
4848                         "fadd   v20.4s, v20.4s, v23.4s      \n"
4849 
4850                         "sub    %4, %4, #512                \n" // kptr -= 8 * 16;
4851 
4852                         "st1    {v20.4s}, [%0], #16         \n"
4853 
4854                         : "=r"(outptr0), // %0
4855                         "=r"(r0),      // %1
4856                         "=r"(r1),      // %2
4857                         "=r"(r2),      // %3
4858                         "=r"(kptr)     // %4
4859                         : "0"(outptr0),
4860                         "1"(r0),
4861                         "2"(r1),
4862                         "3"(r2),
4863                         "4"(kptr)
4864                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
4865 #else  // __aarch64__
4866                     asm volatile(
4867                         "pld        [%0, #128]          \n"
4868                         "vld1.f32   {d24-d25}, [%0 :128] \n" // sum0
4869 
4870                         "pld        [%1, #384]          \n"
4871                         "vldm       %1, {d0-d5}         \n" // r00 r01 r02
4872 
4873                         "pld        [%4, #512]          \n"
4874                         "vldm       %4!, {d16-d23}      \n"
4875 
4876                         "vmul.f32   q13, q8, d0[0]      \n"
4877                         "vmul.f32   q14, q9, d0[1]      \n"
4878                         "vmul.f32   q15, q10, d1[0]     \n"
4879                         "vmla.f32   q12, q11, d1[1]     \n"
4880 
4881                         "pld        [%4, #512]          \n"
4882                         "vldm       %4!, {d16-d23}      \n"
4883 
4884                         "vmla.f32   q13, q8, d2[0]      \n"
4885                         "vmla.f32   q14, q9, d2[1]      \n"
4886                         "vmla.f32   q15, q10, d3[0]     \n"
4887                         "vmla.f32   q12, q11, d3[1]     \n"
4888 
4889                         "pld        [%4, #512]          \n"
4890                         "vldm       %4!, {d16-d23}      \n"
4891 
4892                         "vmla.f32   q13, q8, d4[0]      \n"
4893                         "vmla.f32   q14, q9, d4[1]      \n"
4894                         "vmla.f32   q15, q10, d5[0]     \n"
4895                         "vmla.f32   q12, q11, d5[1]     \n"
4896 
4897                         "pld        [%2, #384]          \n"
4898                         "vldm       %2, {d0-d5}         \n" // r10 r11 r12
4899 
4900                         "pld        [%4, #512]          \n"
4901                         "vldm       %4!, {d16-d23}      \n"
4902 
4903                         "vmla.f32   q13, q8, d0[0]      \n"
4904                         "vmla.f32   q14, q9, d0[1]      \n"
4905                         "vmla.f32   q15, q10, d1[0]     \n"
4906                         "vmla.f32   q12, q11, d1[1]     \n"
4907 
4908                         "pld        [%4, #512]          \n"
4909                         "vldm       %4!, {d16-d23}      \n"
4910 
4911                         "vmla.f32   q13, q8, d2[0]      \n"
4912                         "vmla.f32   q14, q9, d2[1]      \n"
4913                         "vmla.f32   q15, q10, d3[0]     \n"
4914                         "vmla.f32   q12, q11, d3[1]     \n"
4915 
4916                         "pld        [%4, #512]          \n"
4917                         "vldm       %4!, {d16-d23}      \n"
4918 
4919                         "vmla.f32   q13, q8, d4[0]      \n"
4920                         "vmla.f32   q14, q9, d4[1]      \n"
4921                         "vmla.f32   q15, q10, d5[0]     \n"
4922                         "vmla.f32   q12, q11, d5[1]     \n"
4923 
4924                         "pld        [%3, #384]          \n"
4925                         "vldm       %3, {d0-d5}         \n" // r20 r21 r22
4926 
4927                         "pld        [%4, #512]          \n"
4928                         "vldm       %4!, {d16-d23}      \n"
4929 
4930                         "vmla.f32   q13, q8, d0[0]      \n"
4931                         "vmla.f32   q14, q9, d0[1]      \n"
4932                         "vmla.f32   q15, q10, d1[0]     \n"
4933                         "vmla.f32   q12, q11, d1[1]     \n"
4934 
4935                         "pld        [%4, #512]          \n"
4936                         "vldm       %4!, {d16-d23}      \n"
4937 
4938                         "vmla.f32   q13, q8, d2[0]      \n"
4939                         "vmla.f32   q14, q9, d2[1]      \n"
4940                         "vmla.f32   q15, q10, d3[0]     \n"
4941                         "vmla.f32   q12, q11, d3[1]     \n"
4942 
4943                         //                         "pld        [%4, #512]          \n"
4944                         "vldm       %4, {d16-d23}       \n"
4945 
4946                         "vmla.f32   q13, q8, d4[0]      \n"
4947                         "vmla.f32   q14, q9, d4[1]      \n"
4948                         "vmla.f32   q15, q10, d5[0]     \n"
4949                         "vmla.f32   q12, q11, d5[1]     \n"
4950 
4951                         "vadd.f32   q14, q14, q13       \n"
4952 
4953                         "add        %1, %1, #32         \n"
4954 
4955                         "vadd.f32   q15, q15, q14       \n"
4956 
4957                         "add        %2, %2, #32         \n"
4958 
4959                         "vadd.f32   q12, q12, q15       \n"
4960 
4961                         "add        %3, %3, #32         \n"
4962 
4963                         "sub        %4, %4, #512        \n" // kptr -= 8 * 16;
4964 
4965                         "vst1.f32   {d24-d25}, [%0 :128]! \n"
4966 
4967                         : "=r"(outptr0), // %0
4968                         "=r"(r0),      // %1
4969                         "=r"(r1),      // %2
4970                         "=r"(r2),      // %3
4971                         "=r"(kptr)     // %4
4972                         : "0"(outptr0),
4973                         "1"(r0),
4974                         "2"(r1),
4975                         "3"(r2),
4976                         "4"(kptr)
4977                         : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4978 #endif // __aarch64__
4979                 }
4980 
4981                 r0 += tailstep;
4982                 r1 += tailstep;
4983                 r2 += tailstep;
4984             }
4985         }
4986     }
4987 }
4988 
conv3x3s2_im2col_sgemm_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)4989 static void conv3x3s2_im2col_sgemm_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
4990 {
4991     int w = bottom_blob.w;
4992     int inch = bottom_blob.c;
4993 
4994     int outw = top_blob.w;
4995     int outh = top_blob.h;
4996     const int size = outw * outh;
4997 
4998     // im2col
4999     Mat bottom_im2col(size, 9, inch, 16u, 4, opt.workspace_allocator);
5000     {
5001         const int gap = (w * 2 - outw * 2) * 4;
5002 
5003         #pragma omp parallel for num_threads(opt.num_threads)
5004         for (int p = 0; p < inch; p++)
5005         {
5006             const Mat img = bottom_blob.channel(p);
5007             Mat out = bottom_im2col.channel(p);
5008 
5009             float* ptr0 = out.row(0);
5010             float* ptr1 = out.row(1);
5011             float* ptr2 = out.row(2);
5012             float* ptr3 = out.row(3);
5013             float* ptr4 = out.row(4);
5014             float* ptr5 = out.row(5);
5015             float* ptr6 = out.row(6);
5016             float* ptr7 = out.row(7);
5017             float* ptr8 = out.row(8);
5018 
5019             const float* r0 = img.row(0);
5020             const float* r1 = img.row(1);
5021             const float* r2 = img.row(2);
5022 
5023             for (int i = 0; i < outh; i++)
5024             {
5025                 int j = 0;
5026                 for (; j + 1 < outw; j += 2)
5027                 {
5028                     float32x4_t _r00 = vld1q_f32(r0);
5029                     float32x4_t _r01 = vld1q_f32(r0 + 4);
5030                     float32x4_t _r02 = vld1q_f32(r0 + 8);
5031                     float32x4_t _r03 = vld1q_f32(r0 + 12);
5032                     float32x4_t _r04 = vld1q_f32(r0 + 16);
5033 
5034                     float32x4_t _r10 = vld1q_f32(r1);
5035                     float32x4_t _r11 = vld1q_f32(r1 + 4);
5036                     float32x4_t _r12 = vld1q_f32(r1 + 8);
5037                     float32x4_t _r13 = vld1q_f32(r1 + 12);
5038                     float32x4_t _r14 = vld1q_f32(r1 + 16);
5039 
5040                     float32x4_t _r20 = vld1q_f32(r2);
5041                     float32x4_t _r21 = vld1q_f32(r2 + 4);
5042                     float32x4_t _r22 = vld1q_f32(r2 + 8);
5043                     float32x4_t _r23 = vld1q_f32(r2 + 12);
5044                     float32x4_t _r24 = vld1q_f32(r2 + 16);
5045 
5046                     vst1q_f32(ptr0, _r00);
5047                     vst1q_f32(ptr0 + 4, _r02);
5048                     vst1q_f32(ptr1, _r01);
5049                     vst1q_f32(ptr1 + 4, _r03);
5050                     vst1q_f32(ptr2, _r02);
5051                     vst1q_f32(ptr2 + 4, _r04);
5052 
5053                     vst1q_f32(ptr3, _r10);
5054                     vst1q_f32(ptr3 + 4, _r12);
5055                     vst1q_f32(ptr4, _r11);
5056                     vst1q_f32(ptr4 + 4, _r13);
5057                     vst1q_f32(ptr5, _r12);
5058                     vst1q_f32(ptr5 + 4, _r14);
5059 
5060                     vst1q_f32(ptr6, _r20);
5061                     vst1q_f32(ptr6 + 4, _r22);
5062                     vst1q_f32(ptr7, _r21);
5063                     vst1q_f32(ptr7 + 4, _r23);
5064                     vst1q_f32(ptr8, _r22);
5065                     vst1q_f32(ptr8 + 4, _r24);
5066 
5067                     r0 += 16;
5068                     r1 += 16;
5069                     r2 += 16;
5070 
5071                     ptr0 += 8;
5072                     ptr1 += 8;
5073                     ptr2 += 8;
5074                     ptr3 += 8;
5075                     ptr4 += 8;
5076                     ptr5 += 8;
5077                     ptr6 += 8;
5078                     ptr7 += 8;
5079                     ptr8 += 8;
5080                 }
5081                 for (; j < outw; j++)
5082                 {
5083                     float32x4_t _r00 = vld1q_f32(r0);
5084                     float32x4_t _r01 = vld1q_f32(r0 + 4);
5085                     float32x4_t _r02 = vld1q_f32(r0 + 8);
5086 
5087                     float32x4_t _r10 = vld1q_f32(r1);
5088                     float32x4_t _r11 = vld1q_f32(r1 + 4);
5089                     float32x4_t _r12 = vld1q_f32(r1 + 8);
5090 
5091                     float32x4_t _r20 = vld1q_f32(r2);
5092                     float32x4_t _r21 = vld1q_f32(r2 + 4);
5093                     float32x4_t _r22 = vld1q_f32(r2 + 8);
5094 
5095                     vst1q_f32(ptr0, _r00);
5096                     vst1q_f32(ptr1, _r01);
5097                     vst1q_f32(ptr2, _r02);
5098                     vst1q_f32(ptr3, _r10);
5099                     vst1q_f32(ptr4, _r11);
5100                     vst1q_f32(ptr5, _r12);
5101                     vst1q_f32(ptr6, _r20);
5102                     vst1q_f32(ptr7, _r21);
5103                     vst1q_f32(ptr8, _r22);
5104 
5105                     r0 += 8;
5106                     r1 += 8;
5107                     r2 += 8;
5108 
5109                     ptr0 += 4;
5110                     ptr1 += 4;
5111                     ptr2 += 4;
5112                     ptr3 += 4;
5113                     ptr4 += 4;
5114                     ptr5 += 4;
5115                     ptr6 += 4;
5116                     ptr7 += 4;
5117                     ptr8 += 4;
5118                 }
5119 
5120                 r0 += gap;
5121                 r1 += gap;
5122                 r2 += gap;
5123             }
5124         }
5125     }
5126 
5127     im2col_sgemm_pack4_neon(bottom_im2col, top_blob, kernel, _bias, opt);
5128 }
5129