1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 static void conv3x3s1_winograd64_pack4to1_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
16 {
17     int w = bottom_blob.w;
18     int h = bottom_blob.h;
19     int inch = bottom_blob.c;
20     //size_t elemsize = bottom_blob.elemsize;
21     int elempack = bottom_blob.elempack;
22 
23     int outw = top_blob.w;
24     int outh = top_blob.h;
25     int outch = top_blob.c;
26 
27     // pad to 6n+2
28     Mat bottom_blob_bordered = bottom_blob;
29 
30     outw = (outw + 5) / 6 * 6;
31     outh = (outh + 5) / 6 * 6;
32 
33     w = outw + 2;
34     h = outh + 2;
35     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
36 
37     const float* bias = _bias;
38 
39     // BEGIN transform input
40     Mat bottom_blob_tm;
41     {
42         int w_tm = outw / 6 * 8;
43         int h_tm = outh / 6 * 8;
44 
45         const int tiles = w_tm / 8 * h_tm / 8;
46 
47         //         bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
48         bottom_blob_tm.create(tiles, 64, inch, 4u * elempack, elempack, opt.workspace_allocator);
49 
50         //         const float itm[8][8] = {
51         //             {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
52         //
53         //             {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
54         //             {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
55         //
56         //             {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
57         //             {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
58         //
59         //             {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
60         //             {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
61         //
62         //             {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
63         //         };
64 
65         // 0 = r00 - r06 + (r04 - r02) * 5.25
66         // 7 = r07 - r01 + (r03 - r05) * 5.25
67 
68         // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
69         // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
70 
71         // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
72         // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
73 
74         // reuse r04 * 1.25
75         // reuse r03 * 2.5
76         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
77         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
78 
79         #pragma omp parallel for num_threads(opt.num_threads)
80         for (int q = 0; q < inch; q++)
81         {
82             const Mat img0 = bottom_blob_bordered.channel(q);
83             Mat img0_tm = bottom_blob_tm.channel(q);
84 
85             float tmp[8][8][4];
86 
87             // tile
88             for (int i = 0; i < h_tm / 8; i++)
89             {
90                 for (int j = 0; j < w_tm / 8; j++)
91                 {
92                     const unsigned short* r0 = img0.row<const unsigned short>(i * 6) + (j * 6) * 4;
93 
94                     for (int m = 0; m < 8; m++)
95                     {
96                         float32x4_t _r00 = vcvt_f32_bf16(vld1_u16(r0));
97                         float32x4_t _r01 = vcvt_f32_bf16(vld1_u16(r0 + 4));
98                         float32x4_t _r02 = vcvt_f32_bf16(vld1_u16(r0 + 8));
99                         float32x4_t _r03 = vcvt_f32_bf16(vld1_u16(r0 + 12));
100                         float32x4_t _r04 = vcvt_f32_bf16(vld1_u16(r0 + 16));
101                         float32x4_t _r05 = vcvt_f32_bf16(vld1_u16(r0 + 20));
102                         float32x4_t _r06 = vcvt_f32_bf16(vld1_u16(r0 + 24));
103                         float32x4_t _r07 = vcvt_f32_bf16(vld1_u16(r0 + 28));
104 
105                         float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
106                         float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
107                         vst1q_f32(tmp[0][m], _tmp0m);
108                         vst1q_f32(tmp[7][m], _tmp7m);
109 
110                         //                         tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25;
111                         //                         tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25;
112 
113                         float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
114                         float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
115 
116                         //                         float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25);
117                         //                         float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25);
118 
119                         float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
120                         float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
121                         vst1q_f32(tmp[1][m], _tmp1m);
122                         vst1q_f32(tmp[2][m], _tmp2m);
123 
124                         //                         tmp[1][m] = tmp12a + tmp12b;
125                         //                         tmp[2][m] = tmp12a - tmp12b;
126 
127                         float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
128                         float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
129 
130                         //                         float tmp34a = (r0[6] + r0[2] * 0.25 - r0[4] * 1.25);
131                         //                         float tmp34b = (r0[1] * 0.5 - r0[3] * 2.5 + r0[5] * 2);
132 
133                         float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
134                         float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
135                         vst1q_f32(tmp[3][m], _tmp3m);
136                         vst1q_f32(tmp[4][m], _tmp4m);
137 
138                         //                         tmp[3][m] = tmp34a + tmp34b;
139                         //                         tmp[4][m] = tmp34a - tmp34b;
140 
141                         float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
142                         float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
143 
144                         //                         float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25) * 4);
145                         //                         float tmp56b = (r0[1] * 2 - r0[3] * 2.5 + r0[5] * 0.5);
146 
147                         float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
148                         float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
149                         vst1q_f32(tmp[5][m], _tmp5m);
150                         vst1q_f32(tmp[6][m], _tmp6m);
151 
152                         //                         tmp[5][m] = tmp56a + tmp56b;
153                         //                         tmp[6][m] = tmp56a - tmp56b;
154 
155                         r0 += w * 4;
156                     }
157 
158                     float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 8 + j) * 4;
159                     float* r0_tm_1 = r0_tm_0 + tiles * 4;
160                     float* r0_tm_2 = r0_tm_0 + tiles * 8;
161                     float* r0_tm_3 = r0_tm_0 + tiles * 12;
162                     float* r0_tm_4 = r0_tm_0 + tiles * 16;
163                     float* r0_tm_5 = r0_tm_0 + tiles * 20;
164                     float* r0_tm_6 = r0_tm_0 + tiles * 24;
165                     float* r0_tm_7 = r0_tm_0 + tiles * 28;
166 
167                     for (int m = 0; m < 8; m++)
168                     {
169                         float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
170                         float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
171                         float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
172                         float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
173                         float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
174                         float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
175                         float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
176                         float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
177 
178                         float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
179                         float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
180 
181                         //                         r0_tm[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25;
182                         //                         r0_tm[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25;
183 
184                         float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
185                         float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
186 
187                         //                         float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25);
188                         //                         float tmp12b = (tmp0[1] + tmp0[5] - tmp0[3] * 4.25);
189 
190                         float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
191                         float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
192 
193                         //                         r0_tm[1] = tmp12a + tmp12b;
194                         //                         r0_tm[2] = tmp12a - tmp12b;
195 
196                         float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
197                         float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
198 
199                         //                         float tmp34a = (tmp0[6] + tmp0[2] * 0.25 - tmp0[4] * 1.25);
200                         //                         float tmp34b = (tmp0[1] * 0.5 - tmp0[3] * 2.5 + tmp0[5] * 2);
201 
202                         float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
203                         float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
204 
205                         //                         r0_tm[3] = tmp34a + tmp34b;
206                         //                         r0_tm[4] = tmp34a - tmp34b;
207 
208                         float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
209                         float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
210 
211                         //                         float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25) * 4);
212                         //                         float tmp56b = (tmp0[1] * 2 - tmp0[3] * 2.5 + tmp0[5] * 0.5);
213 
214                         float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
215                         float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
216 
217                         //                         r0_tm[5] = tmp56a + tmp56b;
218                         //                         r0_tm[6] = tmp56a - tmp56b;
219 
220                         vst1q_f32(r0_tm_0, _r0tm0);
221                         vst1q_f32(r0_tm_1, _r0tm1);
222                         vst1q_f32(r0_tm_2, _r0tm2);
223                         vst1q_f32(r0_tm_3, _r0tm3);
224                         vst1q_f32(r0_tm_4, _r0tm4);
225                         vst1q_f32(r0_tm_5, _r0tm5);
226                         vst1q_f32(r0_tm_6, _r0tm6);
227                         vst1q_f32(r0_tm_7, _r0tm7);
228 
229                         r0_tm_0 += tiles * 32;
230                         r0_tm_1 += tiles * 32;
231                         r0_tm_2 += tiles * 32;
232                         r0_tm_3 += tiles * 32;
233                         r0_tm_4 += tiles * 32;
234                         r0_tm_5 += tiles * 32;
235                         r0_tm_6 += tiles * 32;
236                         r0_tm_7 += tiles * 32;
237                     }
238                 }
239             }
240         }
241     }
242     bottom_blob_bordered = Mat();
243     // END transform input
244 
245     // BEGIN dot
246     Mat top_blob_tm;
247     {
248         int w_tm = outw / 6 * 8;
249         int h_tm = outh / 6 * 8;
250 
251         const int tiles = h_tm / 8 * w_tm / 8;
252 
253         // permute
254         //         bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
255         Mat bottom_blob_tm2;
256 #if __aarch64__
257         if (tiles >= 12)
258             bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + tiles % 12 % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
259         else if (tiles >= 8)
260             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
261         else if (tiles >= 4)
262             bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
263         else // if (tiles >= 1)
264             bottom_blob_tm2.create(1 * inch, tiles, 64, 4u * elempack, elempack, opt.workspace_allocator);
265 #else
266         if (tiles >= 8)
267             bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
268         else if (tiles >= 4)
269             bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
270         else // if (tiles >= 1)
271             bottom_blob_tm2.create(1 * inch, tiles, 64, 4u * elempack, elempack, opt.workspace_allocator);
272 #endif
273 
274         #pragma omp parallel for num_threads(opt.num_threads)
275         for (int r = 0; r < 64; r++)
276         {
277             Mat tm2 = bottom_blob_tm2.channel(r);
278 
279             // tile
280             int i = 0;
281 #if __aarch64__
282             for (; i + 11 < tiles; i += 12)
283             {
284                 float* tm2p = tm2.row(i / 12);
285 
286                 const float* r0 = bottom_blob_tm;
287 
288                 r0 += (r * tiles + i) * 4;
289 
290                 for (int q = 0; q < inch; q++)
291                 {
292                     asm volatile(
293                         "prfm   pldl1keep, [%0, #512]       \n"
294                         "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
295                         "prfm   pldl1keep, [%0, #512]       \n"
296                         "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
297                         "prfm   pldl1keep, [%0, #512]       \n"
298                         "ld4    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n"
299                         "sub    %0, %0, #128                \n"
300                         "st1    {v0.4s}, [%1], #16          \n"
301                         "st1    {v4.4s}, [%1], #16          \n"
302                         "st1    {v16.4s}, [%1], #16         \n"
303                         "st1    {v1.4s}, [%1], #16          \n"
304                         "st1    {v5.4s}, [%1], #16          \n"
305                         "st1    {v17.4s}, [%1], #16         \n"
306                         "st1    {v2.4s}, [%1], #16          \n"
307                         "st1    {v6.4s}, [%1], #16          \n"
308                         "st1    {v18.4s}, [%1], #16         \n"
309                         "st1    {v3.4s}, [%1], #16          \n"
310                         "st1    {v7.4s}, [%1], #16          \n"
311                         "st1    {v19.4s}, [%1], #16         \n"
312                         : "=r"(r0),  // %0
313                         "=r"(tm2p) // %1
314                         : "0"(r0),
315                         "1"(tm2p)
316                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
317                     r0 += bottom_blob_tm.cstep * 4;
318                 }
319             }
320 #endif
321             for (; i + 7 < tiles; i += 8)
322             {
323 #if __aarch64__
324                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
325 #else
326                 float* tm2p = tm2.row(i / 8);
327 #endif
328 
329                 const float* r0 = bottom_blob_tm;
330 
331                 r0 += (r * tiles + i) * 4;
332 
333                 for (int q = 0; q < inch; q++)
334                 {
335 #if __aarch64__
336                     asm volatile(
337                         "prfm   pldl1keep, [%0, #512]       \n"
338                         "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
339                         "prfm   pldl1keep, [%0, #512]       \n"
340                         "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
341                         "sub    %0, %0, #64                 \n"
342                         "st1    {v0.4s}, [%1], #16          \n"
343                         "st1    {v4.4s}, [%1], #16          \n"
344                         "st1    {v1.4s}, [%1], #16          \n"
345                         "st1    {v5.4s}, [%1], #16          \n"
346                         "st1    {v2.4s}, [%1], #16          \n"
347                         "st1    {v6.4s}, [%1], #16          \n"
348                         "st1    {v3.4s}, [%1], #16          \n"
349                         "st1    {v7.4s}, [%1], #16          \n"
350                         : "=r"(r0),  // %0
351                         "=r"(tm2p) // %1
352                         : "0"(r0),
353                         "1"(tm2p)
354                         : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
355 #else
356                     asm volatile(
357                         "pld        [%0, #256]          \n"
358                         "vld4.f32   {d0-d3}, [%0 :128]! \n"
359                         "pld        [%0, #256]          \n"
360                         "vld4.f32   {d4-d7}, [%0 :128]! \n"
361                         "pld        [%0, #256]          \n"
362                         "vld4.f32   {d16-d19}, [%0 :128]! \n"
363                         "pld        [%0, #256]          \n"
364                         "vld4.f32   {d20-d23}, [%0 :128] \n"
365                         "sub        %0, %0, #96         \n"
366                         "vswp       d1, d4              \n"
367                         "vswp       d3, d6              \n"
368                         "vswp       d17, d20            \n"
369                         "vswp       d19, d22            \n"
370                         "vst1.f32   {d0-d1}, [%1 :128]! \n"
371                         "vst1.f32   {d16-d17}, [%1 :128]! \n"
372                         "vst1.f32   {d4-d5}, [%1 :128]! \n"
373                         "vst1.f32   {d20-d21}, [%1 :128]! \n"
374                         "vst1.f32   {d2-d3}, [%1 :128]! \n"
375                         "vst1.f32   {d18-d19}, [%1 :128]! \n"
376                         "vst1.f32   {d6-d7}, [%1 :128]! \n"
377                         "vst1.f32   {d22-d23}, [%1 :128]! \n"
378                         : "=r"(r0),  // %0
379                         "=r"(tm2p) // %1
380                         : "0"(r0),
381                         "1"(tm2p)
382                         : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
383 #endif
384                     r0 += bottom_blob_tm.cstep * 4;
385                 }
386             }
387             for (; i + 3 < tiles; i += 4)
388             {
389 #if __aarch64__
390                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
391 #else
392                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
393 #endif
394 
395                 const float* r0 = bottom_blob_tm;
396 
397                 r0 += (r * tiles + i) * 4;
398 
399                 for (int q = 0; q < inch; q++)
400                 {
401 #if __aarch64__
402                     asm volatile(
403                         "prfm   pldl1keep, [%0, #512]       \n"
404                         "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
405                         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
406                         : "=r"(r0),  // %0
407                         "=r"(tm2p) // %1
408                         : "0"(r0),
409                         "1"(tm2p)
410                         : "memory", "v0", "v1", "v2", "v3");
411 #else
412                     asm volatile(
413                         "pld        [%0, #256]          \n"
414                         "vld4.f32   {d0-d3}, [%0 :128]! \n"
415                         "pld        [%0, #256]          \n"
416                         "vld4.f32   {d4-d7}, [%0 :128]  \n"
417                         "sub        %0, %0, #32         \n"
418                         "vswp       d1, d4              \n"
419                         "vswp       d3, d6              \n"
420                         "vst1.f32   {d0-d1}, [%1 :128]! \n"
421                         "vst1.f32   {d4-d5}, [%1 :128]! \n"
422                         "vst1.f32   {d2-d3}, [%1 :128]! \n"
423                         "vst1.f32   {d6-d7}, [%1 :128]! \n"
424                         : "=r"(r0),  // %0
425                         "=r"(tm2p) // %1
426                         : "0"(r0),
427                         "1"(tm2p)
428                         : "memory", "q0", "q1", "q2", "q3");
429 #endif // __aarch64__
430                     r0 += bottom_blob_tm.cstep * 4;
431                 }
432             }
433             for (; i < tiles; i++)
434             {
435 #if __aarch64__
436                 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
437 #else
438                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + i % 4);
439 #endif
440 
441                 const float* r0 = bottom_blob_tm;
442 
443                 r0 += (r * tiles + i) * 4;
444 
445                 for (int q = 0; q < inch; q++)
446                 {
447 #if __aarch64__
448                     asm volatile(
449                         "prfm   pldl1keep, [%0, #128]       \n"
450                         "ld1    {v0.4s}, [%0]               \n"
451                         "st1    {v0.4s}, [%1], #16          \n"
452                         : "=r"(r0),  // %0
453                         "=r"(tm2p) // %1
454                         : "0"(r0),
455                         "1"(tm2p)
456                         : "memory", "v0");
457 #else
458                     asm volatile(
459                         "pld        [%0, #128]          \n"
460                         "vld1.f32   {d0-d1}, [%0 :128]  \n"
461                         "vst1.f32   {d0-d1}, [%1 :128]! \n"
462                         : "=r"(r0),  // %0
463                         "=r"(tm2p) // %1
464                         : "0"(r0),
465                         "1"(tm2p)
466                         : "memory", "q0");
467 #endif // __aarch64__
468                     r0 += bottom_blob_tm.cstep * 4;
469                 }
470             }
471         }
472 
473         bottom_blob_tm = Mat();
474         // permute end
475 
476         top_blob_tm.create(tiles, 64, outch, 4u, 1, opt.workspace_allocator);
477 
478         int nn_outch = 0;
479         int remain_outch_start = 0;
480 
481 #if __aarch64__
482         nn_outch = outch >> 3;
483 
484         #pragma omp parallel for num_threads(opt.num_threads)
485         for (int pp = 0; pp < nn_outch; pp++)
486         {
487             int p = pp * 8;
488 
489             float* output0_tm = top_blob_tm.channel(p);
490             float* output1_tm = top_blob_tm.channel(p + 1);
491             float* output2_tm = top_blob_tm.channel(p + 2);
492             float* output3_tm = top_blob_tm.channel(p + 3);
493             float* output4_tm = top_blob_tm.channel(p + 4);
494             float* output5_tm = top_blob_tm.channel(p + 5);
495             float* output6_tm = top_blob_tm.channel(p + 6);
496             float* output7_tm = top_blob_tm.channel(p + 7);
497 
498             const Mat kernel01_tm = kernel_tm.channel(p / 8);
499 
500             for (int r = 0; r < 64; r++)
501             {
502                 const Mat bb2 = bottom_blob_tm2.channel(r);
503 
504                 int i = 0;
505                 for (; i + 11 < tiles; i += 12)
506                 {
507                     const float* r0 = bb2.row(i / 12);
508 
509                     const float* kptr = kernel01_tm.row(r);
510 
511                     int nn = inch; // inch always > 0
512 
513                     asm volatile(
514                         "eor    v8.16b, v8.16b, v8.16b      \n"
515                         "eor    v9.16b, v9.16b, v9.16b      \n"
516                         "eor    v10.16b, v10.16b, v10.16b   \n"
517                         "eor    v11.16b, v11.16b, v11.16b   \n"
518                         "eor    v12.16b, v12.16b, v12.16b   \n"
519                         "eor    v13.16b, v13.16b, v13.16b   \n"
520                         "eor    v14.16b, v14.16b, v14.16b   \n"
521                         "eor    v15.16b, v15.16b, v15.16b   \n"
522                         "eor    v16.16b, v16.16b, v16.16b   \n"
523                         "eor    v17.16b, v17.16b, v17.16b   \n"
524                         "eor    v18.16b, v18.16b, v18.16b   \n"
525                         "eor    v19.16b, v19.16b, v19.16b   \n"
526                         "eor    v20.16b, v20.16b, v20.16b   \n"
527                         "eor    v21.16b, v21.16b, v21.16b   \n"
528                         "eor    v22.16b, v22.16b, v22.16b   \n"
529                         "eor    v23.16b, v23.16b, v23.16b   \n"
530                         "eor    v24.16b, v24.16b, v24.16b   \n"
531                         "eor    v25.16b, v25.16b, v25.16b   \n"
532                         "eor    v26.16b, v26.16b, v26.16b   \n"
533                         "eor    v27.16b, v27.16b, v27.16b   \n"
534                         "eor    v28.16b, v28.16b, v28.16b   \n"
535                         "eor    v29.16b, v29.16b, v29.16b   \n"
536                         "eor    v30.16b, v30.16b, v30.16b   \n"
537                         "eor    v31.16b, v31.16b, v31.16b   \n"
538 
539                         "0:                             \n"
540 
541                         "prfm   pldl1keep, [%9, #512]   \n"
542                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
543 
544                         "prfm   pldl1keep, [%10, #512]  \n"
545                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
546 
547                         "subs   %w0, %w0, #1            \n"
548 
549                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
550                         "fmla   v11.4s, v0.4s, v4.s[1]  \n"
551                         "fmla   v14.4s, v0.4s, v4.s[2]  \n"
552                         "fmla   v17.4s, v0.4s, v4.s[3]  \n"
553                         "fmla   v20.4s, v0.4s, v5.s[0]  \n"
554                         "fmla   v23.4s, v0.4s, v5.s[1]  \n"
555                         "fmla   v26.4s, v0.4s, v5.s[2]  \n"
556                         "fmla   v29.4s, v0.4s, v5.s[3]  \n"
557 
558                         "fmla   v9.4s, v1.4s, v4.s[0]   \n"
559                         "fmla   v12.4s, v1.4s, v4.s[1]  \n"
560                         "fmla   v15.4s, v1.4s, v4.s[2]  \n"
561                         "fmla   v18.4s, v1.4s, v4.s[3]  \n"
562                         "fmla   v21.4s, v1.4s, v5.s[0]  \n"
563                         "fmla   v24.4s, v1.4s, v5.s[1]  \n"
564                         "fmla   v27.4s, v1.4s, v5.s[2]  \n"
565                         "fmla   v30.4s, v1.4s, v5.s[3]  \n"
566 
567                         "fmla   v10.4s, v2.4s, v4.s[0]  \n"
568                         "fmla   v13.4s, v2.4s, v4.s[1]  \n"
569                         "fmla   v16.4s, v2.4s, v4.s[2]  \n"
570                         "fmla   v19.4s, v2.4s, v4.s[3]  \n"
571                         "fmla   v22.4s, v2.4s, v5.s[0]  \n"
572                         "fmla   v25.4s, v2.4s, v5.s[1]  \n"
573                         "fmla   v28.4s, v2.4s, v5.s[2]  \n"
574                         "fmla   v31.4s, v2.4s, v5.s[3]  \n"
575 
576                         "fmla   v8.4s, v3.4s, v6.s[0]   \n"
577                         "fmla   v11.4s, v3.4s, v6.s[1]  \n"
578                         "fmla   v14.4s, v3.4s, v6.s[2]  \n"
579                         "fmla   v17.4s, v3.4s, v6.s[3]  \n"
580                         "fmla   v20.4s, v3.4s, v7.s[0]  \n"
581                         "fmla   v23.4s, v3.4s, v7.s[1]  \n"
582                         "fmla   v26.4s, v3.4s, v7.s[2]  \n"
583                         "fmla   v29.4s, v3.4s, v7.s[3]  \n"
584 
585                         "prfm   pldl1keep, [%9, #512]   \n"
586                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
587 
588                         "fmla   v9.4s, v0.4s, v6.s[0]   \n"
589                         "fmla   v12.4s, v0.4s, v6.s[1]  \n"
590                         "fmla   v15.4s, v0.4s, v6.s[2]  \n"
591                         "fmla   v18.4s, v0.4s, v6.s[3]  \n"
592                         "fmla   v21.4s, v0.4s, v7.s[0]  \n"
593                         "fmla   v24.4s, v0.4s, v7.s[1]  \n"
594                         "fmla   v27.4s, v0.4s, v7.s[2]  \n"
595                         "fmla   v30.4s, v0.4s, v7.s[3]  \n"
596 
597                         "fmla   v10.4s, v1.4s, v6.s[0]  \n"
598                         "fmla   v13.4s, v1.4s, v6.s[1]  \n"
599                         "fmla   v16.4s, v1.4s, v6.s[2]  \n"
600                         "fmla   v19.4s, v1.4s, v6.s[3]  \n"
601                         "fmla   v22.4s, v1.4s, v7.s[0]  \n"
602                         "fmla   v25.4s, v1.4s, v7.s[1]  \n"
603                         "fmla   v28.4s, v1.4s, v7.s[2]  \n"
604                         "fmla   v31.4s, v1.4s, v7.s[3]  \n"
605 
606                         "prfm   pldl1keep, [%10, #512]  \n"
607                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
608 
609                         "fmla   v8.4s, v2.4s, v4.s[0]   \n"
610                         "fmla   v11.4s, v2.4s, v4.s[1]  \n"
611                         "fmla   v14.4s, v2.4s, v4.s[2]  \n"
612                         "fmla   v17.4s, v2.4s, v4.s[3]  \n"
613                         "fmla   v20.4s, v2.4s, v5.s[0]  \n"
614                         "fmla   v23.4s, v2.4s, v5.s[1]  \n"
615                         "fmla   v26.4s, v2.4s, v5.s[2]  \n"
616                         "fmla   v29.4s, v2.4s, v5.s[3]  \n"
617 
618                         "fmla   v9.4s, v3.4s, v4.s[0]   \n"
619                         "fmla   v12.4s, v3.4s, v4.s[1]  \n"
620                         "fmla   v15.4s, v3.4s, v4.s[2]  \n"
621                         "fmla   v18.4s, v3.4s, v4.s[3]  \n"
622                         "fmla   v21.4s, v3.4s, v5.s[0]  \n"
623                         "fmla   v24.4s, v3.4s, v5.s[1]  \n"
624                         "fmla   v27.4s, v3.4s, v5.s[2]  \n"
625                         "fmla   v30.4s, v3.4s, v5.s[3]  \n"
626 
627                         "prfm   pldl1keep, [%9, #512]   \n"
628                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
629 
630                         "fmla   v10.4s, v0.4s, v4.s[0]  \n"
631                         "fmla   v13.4s, v0.4s, v4.s[1]  \n"
632                         "fmla   v16.4s, v0.4s, v4.s[2]  \n"
633                         "fmla   v19.4s, v0.4s, v4.s[3]  \n"
634                         "fmla   v22.4s, v0.4s, v5.s[0]  \n"
635                         "fmla   v25.4s, v0.4s, v5.s[1]  \n"
636                         "fmla   v28.4s, v0.4s, v5.s[2]  \n"
637                         "fmla   v31.4s, v0.4s, v5.s[3]  \n"
638 
639                         "fmla   v8.4s, v1.4s, v6.s[0]   \n"
640                         "fmla   v11.4s, v1.4s, v6.s[1]  \n"
641                         "fmla   v14.4s, v1.4s, v6.s[2]  \n"
642                         "fmla   v17.4s, v1.4s, v6.s[3]  \n"
643                         "fmla   v20.4s, v1.4s, v7.s[0]  \n"
644                         "fmla   v23.4s, v1.4s, v7.s[1]  \n"
645                         "fmla   v26.4s, v1.4s, v7.s[2]  \n"
646                         "fmla   v29.4s, v1.4s, v7.s[3]  \n"
647 
648                         "fmla   v9.4s, v2.4s, v6.s[0]   \n"
649                         "fmla   v12.4s, v2.4s, v6.s[1]  \n"
650                         "fmla   v15.4s, v2.4s, v6.s[2]  \n"
651                         "fmla   v18.4s, v2.4s, v6.s[3]  \n"
652                         "fmla   v21.4s, v2.4s, v7.s[0]  \n"
653                         "fmla   v24.4s, v2.4s, v7.s[1]  \n"
654                         "fmla   v27.4s, v2.4s, v7.s[2]  \n"
655                         "fmla   v30.4s, v2.4s, v7.s[3]  \n"
656 
657                         "fmla   v10.4s, v3.4s, v6.s[0]  \n"
658                         "fmla   v13.4s, v3.4s, v6.s[1]  \n"
659                         "fmla   v16.4s, v3.4s, v6.s[2]  \n"
660                         "fmla   v19.4s, v3.4s, v6.s[3]  \n"
661                         "fmla   v22.4s, v3.4s, v7.s[0]  \n"
662                         "fmla   v25.4s, v3.4s, v7.s[1]  \n"
663                         "fmla   v28.4s, v3.4s, v7.s[2]  \n"
664                         "fmla   v31.4s, v3.4s, v7.s[3]  \n"
665 
666                         "bne    0b                      \n"
667 
668                         "st1    {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
669                         "st1    {v11.4s, v12.4s, v13.4s}, [%2], #48 \n"
670                         "st1    {v14.4s, v15.4s, v16.4s}, [%3], #48 \n"
671                         "st1    {v17.4s, v18.4s, v19.4s}, [%4], #48 \n"
672                         "st1    {v20.4s, v21.4s, v22.4s}, [%5], #48 \n"
673                         "st1    {v23.4s, v24.4s, v25.4s}, [%6], #48 \n"
674                         "st1    {v26.4s, v27.4s, v28.4s}, [%7], #48 \n"
675                         "st1    {v29.4s, v30.4s, v31.4s}, [%8], #48 \n"
676 
677                         : "=r"(nn),         // %0
678                         "=r"(output0_tm), // %1
679                         "=r"(output1_tm), // %2
680                         "=r"(output2_tm), // %3
681                         "=r"(output3_tm), // %4
682                         "=r"(output4_tm), // %5
683                         "=r"(output5_tm), // %6
684                         "=r"(output6_tm), // %7
685                         "=r"(output7_tm), // %8
686                         "=r"(r0),         // %9
687                         "=r"(kptr)        // %10
688                         : "0"(nn),
689                         "1"(output0_tm),
690                         "2"(output1_tm),
691                         "3"(output2_tm),
692                         "4"(output3_tm),
693                         "5"(output4_tm),
694                         "6"(output5_tm),
695                         "7"(output6_tm),
696                         "8"(output7_tm),
697                         "9"(r0),
698                         "10"(kptr)
699                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
700                 }
701                 for (; i + 7 < tiles; i += 8)
702                 {
                        // NOTE(review): bb2 rows appear packed in 12/8/4/1 tile groups
                        // (see the divisor chain used below and in the later loops);
                        // this selects the row holding the current 8-tile group.
703                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
704 
                        // transformed kernel row for winograd coefficient index r
705                     const float* kptr = kernel01_tm.row(r);
706 
707                     int nn = inch; // inch always > 0
708 
709                     asm volatile(
                            // zero accumulators v16..v31: two 4-float vectors per
                            // output channel (8 output channels x 8 tiles)
710                         "eor    v16.16b, v16.16b, v16.16b   \n"
711                         "eor    v17.16b, v17.16b, v17.16b   \n"
712                         "eor    v18.16b, v18.16b, v18.16b   \n"
713                         "eor    v19.16b, v19.16b, v19.16b   \n"
714                         "eor    v20.16b, v20.16b, v20.16b   \n"
715                         "eor    v21.16b, v21.16b, v21.16b   \n"
716                         "eor    v22.16b, v22.16b, v22.16b   \n"
717                         "eor    v23.16b, v23.16b, v23.16b   \n"
718                         "eor    v24.16b, v24.16b, v24.16b   \n"
719                         "eor    v25.16b, v25.16b, v25.16b   \n"
720                         "eor    v26.16b, v26.16b, v26.16b   \n"
721                         "eor    v27.16b, v27.16b, v27.16b   \n"
722                         "eor    v28.16b, v28.16b, v28.16b   \n"
723                         "eor    v29.16b, v29.16b, v29.16b   \n"
724                         "eor    v30.16b, v30.16b, v30.16b   \n"
725                         "eor    v31.16b, v31.16b, v31.16b   \n"
726 
                            // loop once per pack4 input channel (nn = inch);
                            // each iteration consumes 4 scalar input lanes
727                         "0:                             \n"
728 
                            // v0..v3: 8 tiles x input lanes 0..1 from r0 (%9)
729                         "prfm   pldl1keep, [%9, #512]   \n"
730                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
731 
                            // v4..v7: kernel scalars for lanes 0..1 across the
                            // 8 output channels, from kptr (%10)
732                         "prfm   pldl1keep, [%10, #512]  \n"
733                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
734 
735                         "subs   %w0, %w0, #1            \n"
736 
737                         "fmla   v16.4s, v0.4s, v4.s[0]  \n"
738                         "fmla   v18.4s, v0.4s, v4.s[1]  \n"
739                         "fmla   v20.4s, v0.4s, v4.s[2]  \n"
740                         "fmla   v22.4s, v0.4s, v4.s[3]  \n"
741                         "fmla   v24.4s, v0.4s, v5.s[0]  \n"
742                         "fmla   v26.4s, v0.4s, v5.s[1]  \n"
743                         "fmla   v28.4s, v0.4s, v5.s[2]  \n"
744                         "fmla   v30.4s, v0.4s, v5.s[3]  \n"
745                         "fmla   v17.4s, v1.4s, v4.s[0]  \n"
746                         "fmla   v19.4s, v1.4s, v4.s[1]  \n"
747                         "fmla   v21.4s, v1.4s, v4.s[2]  \n"
748                         "fmla   v23.4s, v1.4s, v4.s[3]  \n"
749                         "fmla   v25.4s, v1.4s, v5.s[0]  \n"
750                         "fmla   v27.4s, v1.4s, v5.s[1]  \n"
751                         "fmla   v29.4s, v1.4s, v5.s[2]  \n"
752                         "fmla   v31.4s, v1.4s, v5.s[3]  \n"
753 
754                         "fmla   v16.4s, v2.4s, v6.s[0]  \n"
755                         "fmla   v18.4s, v2.4s, v6.s[1]  \n"
756                         "fmla   v20.4s, v2.4s, v6.s[2]  \n"
757                         "fmla   v22.4s, v2.4s, v6.s[3]  \n"
758                         "fmla   v24.4s, v2.4s, v7.s[0]  \n"
759                         "fmla   v26.4s, v2.4s, v7.s[1]  \n"
760                         "fmla   v28.4s, v2.4s, v7.s[2]  \n"
761                         "fmla   v30.4s, v2.4s, v7.s[3]  \n"
762                         "fmla   v17.4s, v3.4s, v6.s[0]  \n"
763                         "fmla   v19.4s, v3.4s, v6.s[1]  \n"
764                         "fmla   v21.4s, v3.4s, v6.s[2]  \n"
765                         "fmla   v23.4s, v3.4s, v6.s[3]  \n"
766                         "fmla   v25.4s, v3.4s, v7.s[0]  \n"
767                         "fmla   v27.4s, v3.4s, v7.s[1]  \n"
768                         "fmla   v29.4s, v3.4s, v7.s[2]  \n"
769                         "fmla   v31.4s, v3.4s, v7.s[3]  \n"
770 
                            // second half of the pack4 channel: input lanes 2..3
                            // (data in v12..v15, kernel scalars in v8..v11)
771                         "prfm   pldl1keep, [%9, #512]   \n"
772                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%9], #64 \n"
773 
774                         "prfm   pldl1keep, [%10, #512]  \n"
775                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
776 
777                         "fmla   v16.4s, v12.4s, v8.s[0] \n"
778                         "fmla   v18.4s, v12.4s, v8.s[1] \n"
779                         "fmla   v20.4s, v12.4s, v8.s[2] \n"
780                         "fmla   v22.4s, v12.4s, v8.s[3] \n"
781                         "fmla   v24.4s, v12.4s, v9.s[0] \n"
782                         "fmla   v26.4s, v12.4s, v9.s[1] \n"
783                         "fmla   v28.4s, v12.4s, v9.s[2] \n"
784                         "fmla   v30.4s, v12.4s, v9.s[3] \n"
785                         "fmla   v17.4s, v13.4s, v8.s[0] \n"
786                         "fmla   v19.4s, v13.4s, v8.s[1] \n"
787                         "fmla   v21.4s, v13.4s, v8.s[2] \n"
788                         "fmla   v23.4s, v13.4s, v8.s[3] \n"
789                         "fmla   v25.4s, v13.4s, v9.s[0] \n"
790                         "fmla   v27.4s, v13.4s, v9.s[1] \n"
791                         "fmla   v29.4s, v13.4s, v9.s[2] \n"
792                         "fmla   v31.4s, v13.4s, v9.s[3] \n"
793 
794                         "fmla   v16.4s, v14.4s, v10.s[0] \n"
795                         "fmla   v18.4s, v14.4s, v10.s[1] \n"
796                         "fmla   v20.4s, v14.4s, v10.s[2] \n"
797                         "fmla   v22.4s, v14.4s, v10.s[3] \n"
798                         "fmla   v24.4s, v14.4s, v11.s[0] \n"
799                         "fmla   v26.4s, v14.4s, v11.s[1] \n"
800                         "fmla   v28.4s, v14.4s, v11.s[2] \n"
801                         "fmla   v30.4s, v14.4s, v11.s[3] \n"
802                         "fmla   v17.4s, v15.4s, v10.s[0] \n"
803                         "fmla   v19.4s, v15.4s, v10.s[1] \n"
804                         "fmla   v21.4s, v15.4s, v10.s[2] \n"
805                         "fmla   v23.4s, v15.4s, v10.s[3] \n"
806                         "fmla   v25.4s, v15.4s, v11.s[0] \n"
807                         "fmla   v27.4s, v15.4s, v11.s[1] \n"
808                         "fmla   v29.4s, v15.4s, v11.s[2] \n"
809                         "fmla   v31.4s, v15.4s, v11.s[3] \n"
810 
811                         "bne    0b                      \n"
812 
                            // store 8 accumulated tiles (two vectors) per output channel
813                         "st1    {v16.4s, v17.4s}, [%1], #32 \n"
814                         "st1    {v18.4s, v19.4s}, [%2], #32 \n"
815                         "st1    {v20.4s, v21.4s}, [%3], #32 \n"
816                         "st1    {v22.4s, v23.4s}, [%4], #32 \n"
817                         "st1    {v24.4s, v25.4s}, [%5], #32 \n"
818                         "st1    {v26.4s, v27.4s}, [%6], #32 \n"
819                         "st1    {v28.4s, v29.4s}, [%7], #32 \n"
820                         "st1    {v30.4s, v31.4s}, [%8], #32 \n"
821 
822                         : "=r"(nn),         // %0
823                         "=r"(output0_tm), // %1
824                         "=r"(output1_tm), // %2
825                         "=r"(output2_tm), // %3
826                         "=r"(output3_tm), // %4
827                         "=r"(output4_tm), // %5
828                         "=r"(output5_tm), // %6
829                         "=r"(output6_tm), // %7
830                         "=r"(output7_tm), // %8
831                         "=r"(r0),         // %9
832                         "=r"(kptr)        // %10
833                         : "0"(nn),
834                         "1"(output0_tm),
835                         "2"(output1_tm),
836                         "3"(output2_tm),
837                         "4"(output3_tm),
838                         "5"(output4_tm),
839                         "6"(output5_tm),
840                         "7"(output6_tm),
841                         "8"(output7_tm),
842                         "9"(r0),
843                         "10"(kptr)
844                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
845                 }
846                 for (; i + 3 < tiles; i += 4)
847                 {
                        // row of the current 4-tile group within the 12/8/4/1 packing
848                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
849 
                        // transformed kernel row for winograd coefficient index r
850                     const float* kptr = kernel01_tm.row(r);
851 
852                     int nn = inch; // inch always > 0
853 
854                     asm volatile(
                            // zero accumulators v16..v23: one 4-float vector per
                            // output channel (8 output channels x 4 tiles)
855                         "eor    v16.16b, v16.16b, v16.16b   \n"
856                         "eor    v17.16b, v17.16b, v17.16b   \n"
857                         "eor    v18.16b, v18.16b, v18.16b   \n"
858                         "eor    v19.16b, v19.16b, v19.16b   \n"
859                         "eor    v20.16b, v20.16b, v20.16b   \n"
860                         "eor    v21.16b, v21.16b, v21.16b   \n"
861                         "eor    v22.16b, v22.16b, v22.16b   \n"
862                         "eor    v23.16b, v23.16b, v23.16b   \n"
863 
                            // loop once per pack4 input channel (nn = inch)
864                         "0:                             \n"
865 
                            // v0..v3: 4 tiles x input lanes 0..3 from r0 (%9)
866                         "prfm   pldl1keep, [%9, #512]   \n"
867                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
868 
                            // v4..v7: kernel scalars, lanes 0..1 x 8 output channels
869                         "prfm   pldl1keep, [%10, #512]  \n"
870                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
871 
872                         "subs   %w0, %w0, #1            \n"
873 
874                         "fmla   v16.4s, v0.4s, v4.s[0]  \n"
875                         "fmla   v17.4s, v0.4s, v4.s[1]  \n"
876                         "fmla   v18.4s, v0.4s, v4.s[2]  \n"
877                         "fmla   v19.4s, v0.4s, v4.s[3]  \n"
878                         "fmla   v20.4s, v0.4s, v5.s[0]  \n"
879                         "fmla   v21.4s, v0.4s, v5.s[1]  \n"
880                         "fmla   v22.4s, v0.4s, v5.s[2]  \n"
881                         "fmla   v23.4s, v0.4s, v5.s[3]  \n"
882 
                            // v8..v11: kernel scalars, lanes 2..3 x 8 output channels
883                         "prfm   pldl1keep, [%10, #512]  \n"
884                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
885 
886                         "fmla   v16.4s, v1.4s, v6.s[0]  \n"
887                         "fmla   v17.4s, v1.4s, v6.s[1]  \n"
888                         "fmla   v18.4s, v1.4s, v6.s[2]  \n"
889                         "fmla   v19.4s, v1.4s, v6.s[3]  \n"
890                         "fmla   v20.4s, v1.4s, v7.s[0]  \n"
891                         "fmla   v21.4s, v1.4s, v7.s[1]  \n"
892                         "fmla   v22.4s, v1.4s, v7.s[2]  \n"
893                         "fmla   v23.4s, v1.4s, v7.s[3]  \n"
894 
895                         "fmla   v16.4s, v2.4s, v8.s[0]  \n"
896                         "fmla   v17.4s, v2.4s, v8.s[1]  \n"
897                         "fmla   v18.4s, v2.4s, v8.s[2]  \n"
898                         "fmla   v19.4s, v2.4s, v8.s[3]  \n"
899                         "fmla   v20.4s, v2.4s, v9.s[0]  \n"
900                         "fmla   v21.4s, v2.4s, v9.s[1]  \n"
901                         "fmla   v22.4s, v2.4s, v9.s[2]  \n"
902                         "fmla   v23.4s, v2.4s, v9.s[3]  \n"
903 
904                         "fmla   v16.4s, v3.4s, v10.s[0] \n"
905                         "fmla   v17.4s, v3.4s, v10.s[1] \n"
906                         "fmla   v18.4s, v3.4s, v10.s[2] \n"
907                         "fmla   v19.4s, v3.4s, v10.s[3] \n"
908                         "fmla   v20.4s, v3.4s, v11.s[0] \n"
909                         "fmla   v21.4s, v3.4s, v11.s[1] \n"
910                         "fmla   v22.4s, v3.4s, v11.s[2] \n"
911                         "fmla   v23.4s, v3.4s, v11.s[3] \n"
912 
913                         "bne    0b                      \n"
914 
                            // store 4 accumulated tiles per output channel
915                         "st1    {v16.4s}, [%1], #16     \n"
916                         "st1    {v17.4s}, [%2], #16     \n"
917                         "st1    {v18.4s}, [%3], #16     \n"
918                         "st1    {v19.4s}, [%4], #16     \n"
919                         "st1    {v20.4s}, [%5], #16     \n"
920                         "st1    {v21.4s}, [%6], #16     \n"
921                         "st1    {v22.4s}, [%7], #16     \n"
922                         "st1    {v23.4s}, [%8], #16     \n"
923 
924                         : "=r"(nn),         // %0
925                         "=r"(output0_tm), // %1
926                         "=r"(output1_tm), // %2
927                         "=r"(output2_tm), // %3
928                         "=r"(output3_tm), // %4
929                         "=r"(output4_tm), // %5
930                         "=r"(output5_tm), // %6
931                         "=r"(output6_tm), // %7
932                         "=r"(output7_tm), // %8
933                         "=r"(r0),         // %9
934                         "=r"(kptr)        // %10
935                         : "0"(nn),
936                         "1"(output0_tm),
937                         "2"(output1_tm),
938                         "3"(output2_tm),
939                         "4"(output3_tm),
940                         "5"(output4_tm),
941                         "6"(output5_tm),
942                         "7"(output6_tm),
943                         "8"(output7_tm),
944                         "9"(r0),
945                         "10"(kptr)
946                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
947                 }
948                 for (; i < tiles; i++)
949                 {
                        // row of the remaining single tile within the 12/8/4/1 packing
950                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
951 
                        // transformed kernel row for winograd coefficient index r
952                     const float* kptr = kernel01_tm.row(r);
953 
954                     int nn = inch; // inch always > 0
955 
956                     asm volatile(
                            // two accumulator pairs: v16/v17 for lanes 0,2 and
                            // v18/v19 for lanes 1,3; merged by the fadd after the loop
957                         "eor    v16.16b, v16.16b, v16.16b   \n"
958                         "eor    v17.16b, v17.16b, v17.16b   \n"
959                         "eor    v18.16b, v18.16b, v18.16b   \n"
960                         "eor    v19.16b, v19.16b, v19.16b   \n"
961 
                            // loop once per pack4 input channel (nn = inch)
962                         "0:                             \n"
963 
                            // v0: the 4 input lanes of this single tile
964                         "prfm   pldl1keep, [%9, #128]   \n"
965                         "ld1    {v0.4s}, [%9], #16      \n"
966 
                            // v4..v7: kernel vectors for lanes 0..1 x 8 output channels
967                         "prfm   pldl1keep, [%10, #512]  \n"
968                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
969 
970                         "subs   %w0, %w0, #1            \n"
971 
972                         "fmla   v16.4s, v4.4s, v0.s[0]  \n"
973                         "fmla   v17.4s, v5.4s, v0.s[0]  \n"
974                         "fmla   v18.4s, v6.4s, v0.s[1]  \n"
975                         "fmla   v19.4s, v7.4s, v0.s[1]  \n"
976 
                            // v8..v11: kernel vectors for lanes 2..3 x 8 output channels
977                         "prfm   pldl1keep, [%10, #512]  \n"
978                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
979 
980                         "fmla   v16.4s, v8.4s, v0.s[2]  \n"
981                         "fmla   v17.4s, v9.4s, v0.s[2]  \n"
982                         "fmla   v18.4s, v10.4s, v0.s[3] \n"
983                         "fmla   v19.4s, v11.4s, v0.s[3] \n"
984 
985                         "bne    0b                      \n"
986 
                            // combine the two partial sums: v16 = out channels 0..3,
                            // v17 = out channels 4..7 for this tile
987                         "fadd   v16.4s, v16.4s, v18.4s  \n"
988                         "fadd   v17.4s, v17.4s, v19.4s  \n"
989 
                            // scatter one scalar result per output channel
990                         "st1    {v16.s}[0], [%1], #4    \n"
991                         "st1    {v16.s}[1], [%2], #4    \n"
992                         "st1    {v16.s}[2], [%3], #4    \n"
993                         "st1    {v16.s}[3], [%4], #4    \n"
994                         "st1    {v17.s}[0], [%5], #4    \n"
995                         "st1    {v17.s}[1], [%6], #4    \n"
996                         "st1    {v17.s}[2], [%7], #4    \n"
997                         "st1    {v17.s}[3], [%8], #4    \n"
998 
999                         : "=r"(nn),         // %0
1000                         "=r"(output0_tm), // %1
1001                         "=r"(output1_tm), // %2
1002                         "=r"(output2_tm), // %3
1003                         "=r"(output3_tm), // %4
1004                         "=r"(output4_tm), // %5
1005                         "=r"(output5_tm), // %6
1006                         "=r"(output6_tm), // %7
1007                         "=r"(output7_tm), // %8
1008                         "=r"(r0),         // %9
1009                         "=r"(kptr)        // %10
1010                         : "0"(nn),
1011                         "1"(output0_tm),
1012                         "2"(output1_tm),
1013                         "3"(output2_tm),
1014                         "4"(output3_tm),
1015                         "5"(output4_tm),
1016                         "6"(output5_tm),
1017                         "7"(output6_tm),
1018                         "8"(output7_tm),
1019                         "9"(r0),
1020                         "10"(kptr)
1021                         : "cc", "memory", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
1022                 }
1023             }
1024         }
1025 
1026         remain_outch_start += nn_outch << 3;
1027         nn_outch = (outch - remain_outch_start) >> 2;
1028 #else  // __aarch64__
1029         nn_outch = outch >> 2;
1030 #endif // __aarch64__
1031 
1032         #pragma omp parallel for num_threads(opt.num_threads)
1033         for (int pp = 0; pp < nn_outch; pp++)
1034         {
1035             int p = remain_outch_start + pp * 4;
1036 
1037             float* output0_tm = top_blob_tm.channel(p);
1038             float* output1_tm = top_blob_tm.channel(p + 1);
1039             float* output2_tm = top_blob_tm.channel(p + 2);
1040             float* output3_tm = top_blob_tm.channel(p + 3);
1041 
1042 #if __aarch64__
1043             const Mat kernel01_tm = kernel_tm.channel(p / 8 + (p % 8) / 4);
1044 #else
1045             const Mat kernel01_tm = kernel_tm.channel(p / 4);
1046 #endif
1047 
1048             for (int r = 0; r < 64; r++)
1049             {
1050                 const Mat bb2 = bottom_blob_tm2.channel(r);
1051 
1052                 int i = 0;
1053 #if __aarch64__
1054                 for (; i + 11 < tiles; i += 12)
1055                 {
                         // each bb2 row holds one 12-tile group in this packing
1056                     const float* r0 = bb2.row(i / 12);
1057 
                         // transformed kernel row for winograd coefficient index r
1058                     const float* kptr = kernel01_tm.row(r);
1059 
1060                     int nn = inch; // inch always > 0
1061 
1062                     asm volatile(
                             // zero accumulators v8..v19: three 4-float vectors per
                             // output channel (4 output channels x 12 tiles)
1063                         "eor    v8.16b, v8.16b, v8.16b      \n"
1064                         "eor    v9.16b, v9.16b, v9.16b      \n"
1065                         "eor    v10.16b, v10.16b, v10.16b   \n"
1066                         "eor    v11.16b, v11.16b, v11.16b   \n"
1067                         "eor    v12.16b, v12.16b, v12.16b   \n"
1068                         "eor    v13.16b, v13.16b, v13.16b   \n"
1069                         "eor    v14.16b, v14.16b, v14.16b   \n"
1070                         "eor    v15.16b, v15.16b, v15.16b   \n"
1071                         "eor    v16.16b, v16.16b, v16.16b   \n"
1072                         "eor    v17.16b, v17.16b, v17.16b   \n"
1073                         "eor    v18.16b, v18.16b, v18.16b   \n"
1074                         "eor    v19.16b, v19.16b, v19.16b   \n"
1075 
                             // loop once per pack4 input channel (nn = inch)
1076                         "0:                             \n"
1077 
                             // v0..v3: first 16 floats of this 12-tile group from r0 (%5)
1078                         "prfm   pldl1keep, [%5, #512]   \n"
1079                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1080 
                             // v4..v7: kernel scalars, lanes 0..3 x 4 output channels (%6)
1081                         "prfm   pldl1keep, [%6, #512]   \n"
1082                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1083 
1084                         "subs   %w0, %w0, #1            \n"
1085 
                             // lane 0: v0/v1/v2 are tiles 0..11, v4 holds per-channel scalars
1086                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1087                         "fmla   v11.4s, v0.4s, v4.s[1]  \n"
1088                         "fmla   v14.4s, v0.4s, v4.s[2]  \n"
1089                         "fmla   v17.4s, v0.4s, v4.s[3]  \n"
1090                         "fmla   v9.4s, v1.4s, v4.s[0]   \n"
1091                         "fmla   v12.4s, v1.4s, v4.s[1]  \n"
1092                         "fmla   v15.4s, v1.4s, v4.s[2]  \n"
1093                         "fmla   v18.4s, v1.4s, v4.s[3]  \n"
1094                         "fmla   v10.4s, v2.4s, v4.s[0]  \n"
1095                         "fmla   v13.4s, v2.4s, v4.s[1]  \n"
1096                         "fmla   v16.4s, v2.4s, v4.s[2]  \n"
1097                         "fmla   v19.4s, v2.4s, v4.s[3]  \n"
1098 
                             // v20..v23: next 16 floats of the group (lane 1 continues in v3)
1099                         "prfm   pldl1keep, [%5, #512]   \n"
1100                         "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%5], #64 \n"
1101 
1102                         "fmla   v8.4s, v3.4s, v5.s[0]   \n"
1103                         "fmla   v11.4s, v3.4s, v5.s[1]  \n"
1104                         "fmla   v14.4s, v3.4s, v5.s[2]  \n"
1105                         "fmla   v17.4s, v3.4s, v5.s[3]  \n"
1106                         "fmla   v9.4s, v20.4s, v5.s[0]  \n"
1107                         "fmla   v12.4s, v20.4s, v5.s[1] \n"
1108                         "fmla   v15.4s, v20.4s, v5.s[2] \n"
1109                         "fmla   v18.4s, v20.4s, v5.s[3] \n"
1110                         "fmla   v10.4s, v21.4s, v5.s[0] \n"
1111                         "fmla   v13.4s, v21.4s, v5.s[1] \n"
1112                         "fmla   v16.4s, v21.4s, v5.s[2] \n"
1113                         "fmla   v19.4s, v21.4s, v5.s[3] \n"
1114 
                             // v24..v27: final 16 floats of the group (lanes 2..3)
1115                         "prfm   pldl1keep, [%5, #512]   \n"
1116                         "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%5], #64 \n"
1117 
1118                         "fmla   v8.4s, v22.4s, v6.s[0]  \n"
1119                         "fmla   v11.4s, v22.4s, v6.s[1] \n"
1120                         "fmla   v14.4s, v22.4s, v6.s[2] \n"
1121                         "fmla   v17.4s, v22.4s, v6.s[3] \n"
1122                         "fmla   v9.4s, v23.4s, v6.s[0]  \n"
1123                         "fmla   v12.4s, v23.4s, v6.s[1] \n"
1124                         "fmla   v15.4s, v23.4s, v6.s[2] \n"
1125                         "fmla   v18.4s, v23.4s, v6.s[3] \n"
1126                         "fmla   v10.4s, v24.4s, v6.s[0] \n"
1127                         "fmla   v13.4s, v24.4s, v6.s[1] \n"
1128                         "fmla   v16.4s, v24.4s, v6.s[2] \n"
1129                         "fmla   v19.4s, v24.4s, v6.s[3] \n"
1130 
1131                         "fmla   v8.4s, v25.4s, v7.s[0]  \n"
1132                         "fmla   v11.4s, v25.4s, v7.s[1] \n"
1133                         "fmla   v14.4s, v25.4s, v7.s[2] \n"
1134                         "fmla   v17.4s, v25.4s, v7.s[3] \n"
1135                         "fmla   v9.4s, v26.4s, v7.s[0]  \n"
1136                         "fmla   v12.4s, v26.4s, v7.s[1] \n"
1137                         "fmla   v15.4s, v26.4s, v7.s[2] \n"
1138                         "fmla   v18.4s, v26.4s, v7.s[3] \n"
1139                         "fmla   v10.4s, v27.4s, v7.s[0] \n"
1140                         "fmla   v13.4s, v27.4s, v7.s[1] \n"
1141                         "fmla   v16.4s, v27.4s, v7.s[2] \n"
1142                         "fmla   v19.4s, v27.4s, v7.s[3] \n"
1143 
1144                         "bne    0b                      \n"
1145 
                             // store 12 accumulated tiles (three vectors) per output channel
1146                         "st1    {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
1147                         "st1    {v11.4s, v12.4s, v13.4s}, [%2], #48 \n"
1148                         "st1    {v14.4s, v15.4s, v16.4s}, [%3], #48 \n"
1149                         "st1    {v17.4s, v18.4s, v19.4s}, [%4], #48 \n"
1150 
1151                         : "=r"(nn),         // %0
1152                         "=r"(output0_tm), // %1
1153                         "=r"(output1_tm), // %2
1154                         "=r"(output2_tm), // %3
1155                         "=r"(output3_tm), // %4
1156                         "=r"(r0),         // %5
1157                         "=r"(kptr)        // %6
1158                         : "0"(nn),
1159                         "1"(output0_tm),
1160                         "2"(output1_tm),
1161                         "3"(output2_tm),
1162                         "4"(output3_tm),
1163                         "5"(r0),
1164                         "6"(kptr)
1165                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
1166                 }
1167 #endif // __aarch64__
1168                 for (; i + 7 < tiles; i += 8)
1169                 {
1170 #if __aarch64__
1171                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1172 #else
1173                     const float* r0 = bb2.row(i / 8);
1174 #endif
1175 
1176                     const float* kptr = kernel01_tm.row(r);
1177 
1178                     int nn = inch; // inch always > 0
1179 
1180 #if __aarch64__
1181                     asm volatile(
1182                         "eor    v8.16b, v8.16b, v8.16b      \n"
1183                         "eor    v9.16b, v9.16b, v9.16b      \n"
1184                         "eor    v10.16b, v10.16b, v10.16b   \n"
1185                         "eor    v11.16b, v11.16b, v11.16b   \n"
1186                         "eor    v12.16b, v12.16b, v12.16b   \n"
1187                         "eor    v13.16b, v13.16b, v13.16b   \n"
1188                         "eor    v14.16b, v14.16b, v14.16b   \n"
1189                         "eor    v15.16b, v15.16b, v15.16b   \n"
1190 
1191                         "0:                             \n"
1192 
1193                         "prfm   pldl1keep, [%5, #512]   \n"
1194                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1195 
1196                         "prfm   pldl1keep, [%6, #512]   \n"
1197                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1198 
1199                         "subs   %w0, %w0, #1            \n"
1200 
1201                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1202                         "fmla   v10.4s, v0.4s, v4.s[1]  \n"
1203                         "fmla   v12.4s, v0.4s, v4.s[2]  \n"
1204                         "fmla   v14.4s, v0.4s, v4.s[3]  \n"
1205                         "fmla   v9.4s, v1.4s, v4.s[0]   \n"
1206                         "fmla   v11.4s, v1.4s, v4.s[1]  \n"
1207                         "fmla   v13.4s, v1.4s, v4.s[2]  \n"
1208                         "fmla   v15.4s, v1.4s, v4.s[3]  \n"
1209 
1210                         "fmla   v8.4s, v2.4s, v5.s[0]   \n"
1211                         "fmla   v10.4s, v2.4s, v5.s[1]  \n"
1212                         "fmla   v12.4s, v2.4s, v5.s[2]  \n"
1213                         "fmla   v14.4s, v2.4s, v5.s[3]  \n"
1214                         "fmla   v9.4s, v3.4s, v5.s[0]   \n"
1215                         "fmla   v11.4s, v3.4s, v5.s[1]  \n"
1216                         "fmla   v13.4s, v3.4s, v5.s[2]  \n"
1217                         "fmla   v15.4s, v3.4s, v5.s[3]  \n"
1218 
1219                         "prfm   pldl1keep, [%5, #512]   \n"
1220                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%5], #64 \n"
1221 
1222                         "fmla   v8.4s, v16.4s, v6.s[0]  \n"
1223                         "fmla   v10.4s, v16.4s, v6.s[1] \n"
1224                         "fmla   v12.4s, v16.4s, v6.s[2] \n"
1225                         "fmla   v14.4s, v16.4s, v6.s[3] \n"
1226                         "fmla   v9.4s, v17.4s, v6.s[0]  \n"
1227                         "fmla   v11.4s, v17.4s, v6.s[1] \n"
1228                         "fmla   v13.4s, v17.4s, v6.s[2] \n"
1229                         "fmla   v15.4s, v17.4s, v6.s[3] \n"
1230 
1231                         "fmla   v8.4s, v18.4s, v7.s[0]  \n"
1232                         "fmla   v10.4s, v18.4s, v7.s[1] \n"
1233                         "fmla   v12.4s, v18.4s, v7.s[2] \n"
1234                         "fmla   v14.4s, v18.4s, v7.s[3] \n"
1235                         "fmla   v9.4s, v19.4s, v7.s[0]  \n"
1236                         "fmla   v11.4s, v19.4s, v7.s[1] \n"
1237                         "fmla   v13.4s, v19.4s, v7.s[2] \n"
1238                         "fmla   v15.4s, v19.4s, v7.s[3] \n"
1239 
1240                         "bne    0b                      \n"
1241 
1242                         "st1    {v8.4s, v9.4s}, [%1], #32 \n"
1243                         "st1    {v10.4s, v11.4s}, [%2], #32 \n"
1244                         "st1    {v12.4s, v13.4s}, [%3], #32 \n"
1245                         "st1    {v14.4s, v15.4s}, [%4], #32 \n"
1246 
1247                         : "=r"(nn),         // %0
1248                         "=r"(output0_tm), // %1
1249                         "=r"(output1_tm), // %2
1250                         "=r"(output2_tm), // %3
1251                         "=r"(output3_tm), // %4
1252                         "=r"(r0),         // %5
1253                         "=r"(kptr)        // %6
1254                         : "0"(nn),
1255                         "1"(output0_tm),
1256                         "2"(output1_tm),
1257                         "3"(output2_tm),
1258                         "4"(output3_tm),
1259                         "5"(r0),
1260                         "6"(kptr)
1261                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1262 #else  // __aarch64__
1263                     asm volatile(
1264                         "veor       q8, q8          \n"
1265                         "veor       q9, q9          \n"
1266                         "veor       q10, q10        \n"
1267                         "veor       q11, q11        \n"
1268                         "veor       q12, q12        \n"
1269                         "veor       q13, q13        \n"
1270                         "veor       q14, q14        \n"
1271                         "veor       q15, q15        \n"
1272 
1273                         "0:                         \n"
1274 
1275                         "pld        [%5, #512]      \n"
1276                         "vldm       %5!, {d0-d7}    \n"
1277 
1278                         "pld        [%6, #512]      \n"
1279                         "vldm       %6!, {d8-d15}   \n"
1280 
1281                         "vmla.f32   q8, q0, d8[0]   \n"
1282                         "vmla.f32   q10, q0, d8[1]  \n"
1283                         "vmla.f32   q12, q0, d9[0]  \n"
1284                         "vmla.f32   q14, q0, d9[1]  \n"
1285                         "vmla.f32   q9, q1, d8[0]   \n"
1286                         "vmla.f32   q11, q1, d8[1]  \n"
1287                         "vmla.f32   q13, q1, d9[0]  \n"
1288                         "vmla.f32   q15, q1, d9[1]  \n"
1289 
1290                         "vmla.f32   q8, q2, d10[0]  \n"
1291                         "vmla.f32   q10, q2, d10[1] \n"
1292                         "vmla.f32   q12, q2, d11[0] \n"
1293                         "vmla.f32   q14, q2, d11[1] \n"
1294                         "vmla.f32   q9, q3, d10[0]  \n"
1295                         "vmla.f32   q11, q3, d10[1] \n"
1296                         "vmla.f32   q13, q3, d11[0] \n"
1297                         "vmla.f32   q15, q3, d11[1] \n"
1298 
1299                         "pld        [%5, #512]      \n"
1300                         "vldm       %5!, {d0-d7}    \n"
1301 
1302                         "vmla.f32   q8, q0, d12[0]  \n"
1303                         "vmla.f32   q10, q0, d12[1] \n"
1304                         "vmla.f32   q12, q0, d13[0] \n"
1305                         "vmla.f32   q14, q0, d13[1] \n"
1306                         "vmla.f32   q9, q1, d12[0]  \n"
1307                         "vmla.f32   q11, q1, d12[1] \n"
1308                         "vmla.f32   q13, q1, d13[0] \n"
1309                         "vmla.f32   q15, q1, d13[1] \n"
1310 
1311                         "subs       %0, %0, #1      \n"
1312 
1313                         "vmla.f32   q8, q2, d14[0]  \n"
1314                         "vmla.f32   q10, q2, d14[1] \n"
1315                         "vmla.f32   q12, q2, d15[0] \n"
1316                         "vmla.f32   q14, q2, d15[1] \n"
1317                         "vmla.f32   q9, q3, d14[0]  \n"
1318                         "vmla.f32   q11, q3, d14[1] \n"
1319                         "vmla.f32   q13, q3, d15[0] \n"
1320                         "vmla.f32   q15, q3, d15[1] \n"
1321 
1322                         "bne        0b              \n"
1323 
1324                         "vst1.f32   {d16-d19}, [%1]! \n"
1325                         "vst1.f32   {d20-d23}, [%2]! \n"
1326                         "vst1.f32   {d24-d27}, [%3]! \n"
1327                         "vst1.f32   {d28-d31}, [%4]! \n"
1328 
1329                         : "=r"(nn),         // %0
1330                         "=r"(output0_tm), // %1
1331                         "=r"(output1_tm), // %2
1332                         "=r"(output2_tm), // %3
1333                         "=r"(output3_tm), // %4
1334                         "=r"(r0),         // %5
1335                         "=r"(kptr)        // %6
1336                         : "0"(nn),
1337                         "1"(output0_tm),
1338                         "2"(output1_tm),
1339                         "3"(output2_tm),
1340                         "4"(output3_tm),
1341                         "5"(r0),
1342                         "6"(kptr)
1343                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1344 #endif // __aarch64__
1345                 }
1346                 for (; i + 3 < tiles; i += 4)
1347                 {
1348 #if __aarch64__
1349                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1350 #else
1351                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1352 #endif
1353 
1354                     const float* kptr = kernel01_tm.row(r);
1355 
1356                     int nn = inch; // inch always > 0
1357 
1358 #if __aarch64__
1359                     asm volatile(
1360                         "eor    v8.16b, v8.16b, v8.16b      \n"
1361                         "eor    v9.16b, v9.16b, v9.16b      \n"
1362                         "eor    v10.16b, v10.16b, v10.16b   \n"
1363                         "eor    v11.16b, v11.16b, v11.16b   \n"
1364 
1365                         "0:                             \n"
1366 
1367                         "prfm   pldl1keep, [%5, #512]   \n"
1368                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1369 
1370                         "prfm   pldl1keep, [%6, #512]   \n"
1371                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1372 
1373                         "subs   %w0, %w0, #1            \n"
1374 
1375                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1376                         "fmla   v9.4s, v0.4s, v4.s[1]   \n"
1377                         "fmla   v10.4s, v0.4s, v4.s[2]  \n"
1378                         "fmla   v11.4s, v0.4s, v4.s[3]  \n"
1379 
1380                         "fmla   v8.4s, v1.4s, v5.s[0]   \n"
1381                         "fmla   v9.4s, v1.4s, v5.s[1]   \n"
1382                         "fmla   v10.4s, v1.4s, v5.s[2]  \n"
1383                         "fmla   v11.4s, v1.4s, v5.s[3]  \n"
1384 
1385                         "fmla   v8.4s, v2.4s, v6.s[0]   \n"
1386                         "fmla   v9.4s, v2.4s, v6.s[1]   \n"
1387                         "fmla   v10.4s, v2.4s, v6.s[2]  \n"
1388                         "fmla   v11.4s, v2.4s, v6.s[3]  \n"
1389 
1390                         "fmla   v8.4s, v3.4s, v7.s[0]   \n"
1391                         "fmla   v9.4s, v3.4s, v7.s[1]   \n"
1392                         "fmla   v10.4s, v3.4s, v7.s[2]  \n"
1393                         "fmla   v11.4s, v3.4s, v7.s[3]  \n"
1394 
1395                         "bne    0b                      \n"
1396 
1397                         "st1    {v8.4s}, [%1], #16      \n"
1398                         "st1    {v9.4s}, [%2], #16      \n"
1399                         "st1    {v10.4s}, [%3], #16     \n"
1400                         "st1    {v11.4s}, [%4], #16     \n"
1401 
1402                         : "=r"(nn),         // %0
1403                         "=r"(output0_tm), // %1
1404                         "=r"(output1_tm), // %2
1405                         "=r"(output2_tm), // %3
1406                         "=r"(output3_tm), // %4
1407                         "=r"(r0),         // %5
1408                         "=r"(kptr)        // %6
1409                         : "0"(nn),
1410                         "1"(output0_tm),
1411                         "2"(output1_tm),
1412                         "3"(output2_tm),
1413                         "4"(output3_tm),
1414                         "5"(r0),
1415                         "6"(kptr)
1416                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
1417 #else  // __aarch64__
1418                     asm volatile(
1419                         "veor       q8, q8          \n"
1420                         "veor       q9, q9          \n"
1421                         "veor       q10, q10        \n"
1422                         "veor       q11, q11        \n"
1423 
1424                         "0:                         \n"
1425 
1426                         "pld        [%5, #512]      \n"
1427                         "vldm       %5!, {d0-d7}    \n"
1428 
1429                         "pld        [%6, #512]      \n"
1430                         "vldm       %6!, {d8-d15}   \n"
1431 
1432                         "vmla.f32   q8, q0, d8[0]   \n"
1433                         "vmla.f32   q9, q0, d8[1]   \n"
1434                         "vmla.f32   q10, q0, d9[0]  \n"
1435                         "vmla.f32   q11, q0, d9[1]  \n"
1436 
1437                         "vmla.f32   q8, q1, d10[0]  \n"
1438                         "vmla.f32   q9, q1, d10[1]  \n"
1439                         "vmla.f32   q10, q1, d11[0] \n"
1440                         "vmla.f32   q11, q1, d11[1] \n"
1441 
1442                         "subs       %0, %0, #1      \n"
1443 
1444                         "vmla.f32   q8, q2, d12[0]  \n"
1445                         "vmla.f32   q9, q2, d12[1]  \n"
1446                         "vmla.f32   q10, q2, d13[0] \n"
1447                         "vmla.f32   q11, q2, d13[1] \n"
1448 
1449                         "vmla.f32   q8, q3, d14[0]  \n"
1450                         "vmla.f32   q9, q3, d14[1]  \n"
1451                         "vmla.f32   q10, q3, d15[0] \n"
1452                         "vmla.f32   q11, q3, d15[1] \n"
1453 
1454                         "bne        0b              \n"
1455 
1456                         "vst1.f32   {d16-d17}, [%1]! \n"
1457                         "vst1.f32   {d18-d19}, [%2]! \n"
1458                         "vst1.f32   {d20-d21}, [%3]! \n"
1459                         "vst1.f32   {d22-d23}, [%4]! \n"
1460 
1461                         : "=r"(nn),         // %0
1462                         "=r"(output0_tm), // %1
1463                         "=r"(output1_tm), // %2
1464                         "=r"(output2_tm), // %3
1465                         "=r"(output3_tm), // %4
1466                         "=r"(r0),         // %5
1467                         "=r"(kptr)        // %6
1468                         : "0"(nn),
1469                         "1"(output0_tm),
1470                         "2"(output1_tm),
1471                         "3"(output2_tm),
1472                         "4"(output3_tm),
1473                         "5"(r0),
1474                         "6"(kptr)
1475                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1476 #endif // __aarch64__
1477                 }
1478                 for (; i < tiles; i++)
1479                 {
1480 #if __aarch64__
1481                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
1482 #else
1483                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
1484 #endif
1485 
1486                     const float* kptr = kernel01_tm.row(r);
1487 
1488                     int nn = inch; // inch always > 0
1489 
1490 #if __aarch64__
1491                     asm volatile(
1492                         "eor    v8.16b, v8.16b, v8.16b  \n"
1493                         "eor    v9.16b, v9.16b, v9.16b  \n"
1494                         "eor    v10.16b, v10.16b, v10.16b \n"
1495                         "eor    v11.16b, v11.16b, v11.16b \n"
1496 
1497                         "0:                             \n"
1498 
1499                         "prfm   pldl1keep, [%5, #128]   \n"
1500                         "ld1    {v0.4s}, [%5], #16      \n"
1501 
1502                         "prfm   pldl1keep, [%6, #512]   \n"
1503                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1504 
1505                         "subs   %w0, %w0, #1            \n"
1506 
1507                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
1508                         "fmla   v9.4s, v5.4s, v0.s[1]   \n"
1509                         "fmla   v10.4s, v6.4s, v0.s[2]  \n"
1510                         "fmla   v11.4s, v7.4s, v0.s[3]  \n"
1511 
1512                         "bne    0b                      \n"
1513 
1514                         "fadd   v8.4s, v8.4s, v9.4s     \n"
1515                         "fadd   v10.4s, v10.4s, v11.4s  \n"
1516                         "fadd   v8.4s, v8.4s, v10.4s    \n"
1517 
1518                         "st1    {v8.s}[0], [%1], #4     \n"
1519                         "st1    {v8.s}[1], [%2], #4     \n"
1520                         "st1    {v8.s}[2], [%3], #4     \n"
1521                         "st1    {v8.s}[3], [%4], #4     \n"
1522 
1523                         : "=r"(nn),         // %0
1524                         "=r"(output0_tm), // %1
1525                         "=r"(output1_tm), // %2
1526                         "=r"(output2_tm), // %3
1527                         "=r"(output3_tm), // %4
1528                         "=r"(r0),         // %5
1529                         "=r"(kptr)        // %6
1530                         : "0"(nn),
1531                         "1"(output0_tm),
1532                         "2"(output1_tm),
1533                         "3"(output2_tm),
1534                         "4"(output3_tm),
1535                         "5"(r0),
1536                         "6"(kptr)
1537                         : "cc", "memory", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
1538 #else  // __aarch64__
1539                     asm volatile(
1540                         "veor       q8, q8          \n"
1541                         "veor       q9, q9          \n"
1542                         "veor       q10, q10        \n"
1543                         "veor       q11, q11        \n"
1544 
1545                         "0:                         \n"
1546 
1547                         "pld        [%5, #128]      \n"
1548                         "vld1.f32   {d0-d1}, [%5]!  \n"
1549 
1550                         "pld        [%6, #512]      \n"
1551                         "vldm       %6!, {d8-d15}   \n"
1552 
1553                         "subs       %0, %0, #1      \n"
1554 
1555                         "vmla.f32   q8, q4, d0[0]   \n"
1556                         "vmla.f32   q9, q5, d0[1]   \n"
1557                         "vmla.f32   q10, q6, d1[0]  \n"
1558                         "vmla.f32   q11, q7, d1[1]  \n"
1559 
1560                         "bne        0b              \n"
1561 
1562                         "vadd.f32   q8, q8, q9      \n"
1563                         "vadd.f32   q10, q10, q11   \n"
1564                         "vadd.f32   q8, q8, q10     \n"
1565 
1566                         "vst1.f32   {d16[0]}, [%1]! \n"
1567                         "vst1.f32   {d16[1]}, [%2]! \n"
1568                         "vst1.f32   {d17[0]}, [%3]! \n"
1569                         "vst1.f32   {d17[1]}, [%4]! \n"
1570 
1571                         : "=r"(nn),         // %0
1572                         "=r"(output0_tm), // %1
1573                         "=r"(output1_tm), // %2
1574                         "=r"(output2_tm), // %3
1575                         "=r"(output3_tm), // %4
1576                         "=r"(r0),         // %5
1577                         "=r"(kptr)        // %6
1578                         : "0"(nn),
1579                         "1"(output0_tm),
1580                         "2"(output1_tm),
1581                         "3"(output2_tm),
1582                         "4"(output3_tm),
1583                         "5"(r0),
1584                         "6"(kptr)
1585                         : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1586 #endif // __aarch64__
1587                 }
1588             }
1589         }
1590 
1591         remain_outch_start += nn_outch << 2;
1592 
1593         #pragma omp parallel for num_threads(opt.num_threads)
1594         for (int p = remain_outch_start; p < outch; p++)
1595         {
1596             float* output0_tm = top_blob_tm.channel(p);
1597 
1598 #if __aarch64__
1599             const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4);
1600 #else
1601             const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);
1602 #endif
1603 
1604             for (int r = 0; r < 64; r++)
1605             {
1606                 const Mat bb2 = bottom_blob_tm2.channel(r);
1607 
1608                 int i = 0;
1609 #if __aarch64__
1610                 for (; i + 11 < tiles; i += 12)
1611                 {
1612                     const float* r0 = bb2.row(i / 12);
1613 
1614                     const float* kptr = kernel0_tm.row(r);
1615 
1616                     int nn = inch; // inch always > 0
1617 
1618                     asm volatile(
1619                         "eor    v8.16b, v8.16b, v8.16b  \n"
1620                         "eor    v9.16b, v9.16b, v9.16b  \n"
1621                         "eor    v10.16b, v10.16b, v10.16b \n"
1622                         "eor    v5.16b, v5.16b, v5.16b  \n"
1623                         "eor    v6.16b, v6.16b, v6.16b  \n"
1624                         "eor    v7.16b, v7.16b, v7.16b  \n"
1625 
1626                         "0:                             \n"
1627 
1628                         "prfm   pldl1keep, [%2, #512]   \n"
1629                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1630 
1631                         "prfm   pldl1keep, [%3, #128]   \n"
1632                         "ld1    {v4.4s}, [%3], #16      \n"
1633 
1634                         "subs   %w0, %w0, #1            \n"
1635 
1636                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1637                         "fmla   v9.4s, v1.4s, v4.s[0]   \n"
1638                         "fmla   v10.4s, v2.4s, v4.s[0]  \n"
1639 
1640                         "prfm   pldl1keep, [%2, #512]   \n"
1641                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n"
1642 
1643                         "fmla   v5.4s, v3.4s, v4.s[1]   \n"
1644                         "fmla   v6.4s, v12.4s, v4.s[1]  \n"
1645                         "fmla   v7.4s, v13.4s, v4.s[1]  \n"
1646 
1647                         "prfm   pldl1keep, [%2, #512]   \n"
1648                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%2], #64 \n"
1649 
1650                         "fmla   v8.4s, v14.4s, v4.s[2]  \n"
1651                         "fmla   v9.4s, v15.4s, v4.s[2]  \n"
1652                         "fmla   v10.4s, v16.4s, v4.s[2] \n"
1653 
1654                         "fmla   v5.4s, v17.4s, v4.s[3]  \n"
1655                         "fmla   v6.4s, v18.4s, v4.s[3]  \n"
1656                         "fmla   v7.4s, v19.4s, v4.s[3]  \n"
1657 
1658                         "bne    0b                      \n"
1659 
1660                         "fadd   v8.4s, v8.4s, v5.4s     \n"
1661                         "fadd   v9.4s, v9.4s, v6.4s     \n"
1662                         "fadd   v10.4s, v10.4s, v7.4s   \n"
1663 
1664                         "st1    {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
1665 
1666                         : "=r"(nn),         // %0
1667                         "=r"(output0_tm), // %1
1668                         "=r"(r0),         // %2
1669                         "=r"(kptr)        // %3
1670                         : "0"(nn),
1671                         "1"(output0_tm),
1672                         "2"(r0),
1673                         "3"(kptr)
1674                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1675                 }
1676 #endif
1677                 for (; i + 7 < tiles; i += 8)
1678                 {
1679 #if __aarch64__
1680                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1681 #else
1682                     const float* r0 = bb2.row(i / 8);
1683 #endif
1684 
1685                     const float* kptr = kernel0_tm.row(r);
1686 
1687                     int nn = inch; // inch always > 0
1688 
1689 #if __aarch64__
1690                     asm volatile(
1691                         "eor    v8.16b, v8.16b, v8.16b  \n"
1692                         "eor    v9.16b, v9.16b, v9.16b  \n"
1693                         "eor    v10.16b, v10.16b, v10.16b \n"
1694                         "eor    v11.16b, v11.16b, v11.16b \n"
1695 
1696                         "0:                             \n"
1697 
1698                         "prfm   pldl1keep, [%2, #512]   \n"
1699                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1700 
1701                         "prfm   pldl1keep, [%3, #128]   \n"
1702                         "ld1    {v4.4s}, [%3], #16      \n"
1703 
1704                         "subs   %w0, %w0, #1            \n"
1705 
1706                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1707                         "fmla   v9.4s, v1.4s, v4.s[0]   \n"
1708                         "fmla   v10.4s, v2.4s, v4.s[1]  \n"
1709                         "fmla   v11.4s, v3.4s, v4.s[1]  \n"
1710 
1711                         "prfm   pldl1keep, [%2, #512]   \n"
1712                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n"
1713 
1714                         "fmla   v8.4s, v12.4s, v4.s[2]  \n"
1715                         "fmla   v9.4s, v13.4s, v4.s[2]  \n"
1716                         "fmla   v10.4s, v14.4s, v4.s[3] \n"
1717                         "fmla   v11.4s, v15.4s, v4.s[3] \n"
1718 
1719                         "bne    0b                      \n"
1720 
1721                         "fadd   v8.4s, v8.4s, v10.4s    \n"
1722                         "fadd   v9.4s, v9.4s, v11.4s    \n"
1723 
1724                         "st1    {v8.4s, v9.4s}, [%1], #32 \n"
1725 
1726                         : "=r"(nn),         // %0
1727                         "=r"(output0_tm), // %1
1728                         "=r"(r0),         // %2
1729                         "=r"(kptr)        // %3
1730                         : "0"(nn),
1731                         "1"(output0_tm),
1732                         "2"(r0),
1733                         "3"(kptr)
1734                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
1735 #else  // __aarch64__
1736                     asm volatile(
1737                         "veor       q8, q8          \n"
1738                         "veor       q9, q9          \n"
1739                         "veor       q10, q10        \n"
1740                         "veor       q11, q11        \n"
1741 
1742                         "0:                         \n"
1743 
1744                         "pld        [%2, #512]      \n"
1745                         "vldm       %2!, {d0-d7}    \n"
1746 
1747                         "pld        [%3, #128]      \n"
1748                         "vld1.f32   {d8-d9}, [%3]!  \n"
1749 
1750                         "vmla.f32   q8, q0, d8[0]   \n"
1751                         "vmla.f32   q9, q1, d8[0]   \n"
1752                         "vmla.f32   q10, q2, d8[1]  \n"
1753                         "vmla.f32   q11, q3, d8[1]  \n"
1754 
1755                         "pld        [%2, #512]      \n"
1756                         "vldm       %2!, {d24-d31}  \n"
1757 
1758                         "subs       %0, %0, #1      \n"
1759 
1760                         "vmla.f32   q8, q12, d9[0]  \n"
1761                         "vmla.f32   q9, q13, d9[0]  \n"
1762                         "vmla.f32   q10, q14, d9[1] \n"
1763                         "vmla.f32   q11, q15, d9[1] \n"
1764 
1765                         "bne        0b              \n"
1766 
1767                         "vadd.f32   q8, q8, q10     \n"
1768                         "vadd.f32   q9, q9, q11     \n"
1769 
1770                         "vst1.f32   {d16-d19}, [%1]! \n"
1771 
1772                         : "=r"(nn),         // %0
1773                         "=r"(output0_tm), // %1
1774                         "=r"(r0),         // %2
1775                         "=r"(kptr)        // %3
1776                         : "0"(nn),
1777                         "1"(output0_tm),
1778                         "2"(r0),
1779                         "3"(kptr)
1780                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1781 #endif // __aarch64__
1782                 }
1783                 for (; i + 3 < tiles; i += 4)
1784                 {
1785 #if __aarch64__
1786                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1787 #else
1788                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1789 #endif
1790 
1791                     const float* kptr = kernel0_tm.row(r);
1792 
1793                     int nn = inch; // inch always > 0
1794 
1795 #if __aarch64__
1796                     asm volatile(
1797                         "eor    v8.16b, v8.16b, v8.16b  \n"
1798                         "eor    v9.16b, v9.16b, v9.16b  \n"
1799                         "eor    v10.16b, v10.16b, v10.16b \n"
1800                         "eor    v11.16b, v11.16b, v11.16b \n"
1801 
1802                         "0:                             \n"
1803 
1804                         "prfm   pldl1keep, [%2, #512]   \n"
1805                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1806 
1807                         "prfm   pldl1keep, [%3, #128]   \n"
1808                         "ld1    {v4.4s}, [%3], #16      \n"
1809 
1810                         "subs   %w0, %w0, #1            \n"
1811 
1812                         "fmla   v8.4s, v0.4s, v4.s[0]   \n"
1813                         "fmla   v9.4s, v1.4s, v4.s[1]   \n"
1814                         "fmla   v10.4s, v2.4s, v4.s[2]  \n"
1815                         "fmla   v11.4s, v3.4s, v4.s[3]  \n"
1816 
1817                         "bne    0b                      \n"
1818 
1819                         "fadd   v8.4s, v8.4s, v9.4s     \n"
1820                         "fadd   v10.4s, v10.4s, v11.4s  \n"
1821                         "fadd   v8.4s, v8.4s, v10.4s    \n"
1822 
1823                         "st1    {v8.4s}, [%1], #16      \n"
1824 
1825                         : "=r"(nn),         // %0
1826                         "=r"(output0_tm), // %1
1827                         "=r"(r0),         // %2
1828                         "=r"(kptr)        // %3
1829                         : "0"(nn),
1830                         "1"(output0_tm),
1831                         "2"(r0),
1832                         "3"(kptr)
1833                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11");
1834 #else  // __aarch64__
1835                     asm volatile(
1836                         "veor       q8, q8          \n"
1837                         "veor       q9, q9          \n"
1838                         "veor       q10, q10        \n"
1839                         "veor       q11, q11        \n"
1840 
1841                         "0:                         \n"
1842 
1843                         "pld        [%2, #512]      \n"
1844                         "vldm       %2!, {d0-d7}    \n"
1845 
1846                         "pld        [%3, #128]      \n"
1847                         "vld1.f32   {d8-d9}, [%3]!  \n"
1848 
1849                         "subs       %0, %0, #1      \n"
1850 
1851                         "vmla.f32   q8, q0, d8[0]   \n"
1852                         "vmla.f32   q9, q1, d8[1]   \n"
1853                         "vmla.f32   q10, q2, d9[0]  \n"
1854                         "vmla.f32   q11, q3, d9[1]  \n"
1855 
1856                         "bne        0b              \n"
1857 
1858                         "vadd.f32   q8, q8, q9      \n"
1859                         "vadd.f32   q10, q10, q11   \n"
1860                         "vadd.f32   q8, q8, q10     \n"
1861 
1862                         "vst1.f32   {d16-d17}, [%1]! \n"
1863 
1864                         : "=r"(nn),         // %0
1865                         "=r"(output0_tm), // %1
1866                         "=r"(r0),         // %2
1867                         "=r"(kptr)        // %3
1868                         : "0"(nn),
1869                         "1"(output0_tm),
1870                         "2"(r0),
1871                         "3"(kptr)
1872                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11");
1873 #endif // __aarch64__
1874                 }
1875                 for (; i < tiles; i++)
1876                 {
1877 #if __aarch64__
1878                     const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
1879 #else
1880                     const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
1881 #endif
1882 
1883                     const float* kptr = kernel0_tm.row(r);
1884 
1885                     float32x4_t _sum0 = vdupq_n_f32(0.f);
1886 
1887                     for (int q = 0; q < inch; q++)
1888                     {
1889                         float32x4_t _r0 = vld1q_f32(r0);
1890 
1891                         float32x4_t _k0 = vld1q_f32(kptr);
1892 
1893                         _sum0 = vmlaq_f32(_sum0, _r0, _k0);
1894 
1895                         kptr += 4;
1896                         r0 += 4;
1897                     }
1898 
1899 #if __aarch64__
1900                     float sum0 = vaddvq_f32(_sum0);
1901 #else
1902                     float32x2_t _ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
1903                     float32x2_t _ss2 = vpadd_f32(_ss, _ss);
1904                     float sum0 = vget_lane_f32(_ss2, 0);
1905 #endif
1906 
1907                     output0_tm[0] = sum0;
1908 
1909                     output0_tm++;
1910                 }
1911             }
1912         }
1913     }
1914     bottom_blob_tm = Mat();
1915     // END dot
1916 
1917     // BEGIN transform output
1918     Mat top_blob_bordered;
1919     if (outw == top_blob.w && outh == top_blob.h)
1920     {
1921         top_blob_bordered = top_blob;
1922     }
1923     else
1924     {
1925         top_blob_bordered.create(outw, outh, outch, 2u, 1, opt.workspace_allocator);
1926     }
1927     {
1928         //         const float otm[6][8] = {
1929         //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
1930         //             {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
1931         //             {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
1932         //             {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
1933         //             {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
1934         //             {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
1935         //         };
1936 
1937         // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
1938         // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
1939         // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
1940         // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
1941         // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
1942         // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
1943 
1944         int w_tm = outw / 6 * 8;
1945         int h_tm = outh / 6 * 8;
1946         const int tiles = w_tm / 8 * h_tm / 8;
1947 
1948         #pragma omp parallel for num_threads(opt.num_threads)
1949         for (int p = 0; p < outch; p++)
1950         {
1951             const Mat out0_tm = top_blob_tm.channel(p);
1952             Mat out0 = top_blob_bordered.channel(p);
1953 
1954             const float bias0 = bias ? bias[p] : 0.f;
1955             // float32x2_t _bias0 = vdup_n_f32(bias0);
1956 
1957             float tmp[6][8];
1958 
1959             // tile
1960             for (int i = 0; i < outh / 6; i++)
1961             {
1962                 for (int j = 0; j < outw / 6; j++)
1963                 {
1964                     //                     top_blob_tm.create(tiles, 64, outch, 4u, 1, opt.workspace_allocator);
1965 
1966                     const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 8 + j) * 1;
1967                     const float* output0_tm_1 = output0_tm_0 + tiles * 1;
1968                     const float* output0_tm_2 = output0_tm_0 + tiles * 2;
1969                     const float* output0_tm_3 = output0_tm_0 + tiles * 3;
1970                     const float* output0_tm_4 = output0_tm_0 + tiles * 4;
1971                     const float* output0_tm_5 = output0_tm_0 + tiles * 5;
1972                     const float* output0_tm_6 = output0_tm_0 + tiles * 6;
1973                     const float* output0_tm_7 = output0_tm_0 + tiles * 7;
1974 
1975                     // TODO neon optimize
1976                     for (int m = 0; m < 8; m++)
1977                     {
1978                         float tmp024a = output0_tm_1[0] + output0_tm_2[0];
1979                         float tmp135a = output0_tm_1[0] - output0_tm_2[0];
1980 
1981                         float tmp024b = output0_tm_3[0] + output0_tm_4[0];
1982                         float tmp135b = output0_tm_3[0] - output0_tm_4[0];
1983 
1984                         float tmp024c = output0_tm_5[0] + output0_tm_6[0];
1985                         float tmp135c = output0_tm_5[0] - output0_tm_6[0];
1986 
1987                         tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
1988                         tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
1989                         tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
1990 
1991                         tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
1992                         tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
1993                         tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
1994 
1995                         output0_tm_0 += tiles * 8;
1996                         output0_tm_1 += tiles * 8;
1997                         output0_tm_2 += tiles * 8;
1998                         output0_tm_3 += tiles * 8;
1999                         output0_tm_4 += tiles * 8;
2000                         output0_tm_5 += tiles * 8;
2001                         output0_tm_6 += tiles * 8;
2002                         output0_tm_7 += tiles * 8;
2003                     }
2004 
2005                     unsigned short* output0 = out0.row<unsigned short>(i * 6) + j * 6;
2006 
2007                     for (int m = 0; m < 6; m++)
2008                     {
2009                         const float* tmp0 = tmp[m];
2010 
2011                         float tmp024a = tmp0[1] + tmp0[2];
2012                         float tmp135a = tmp0[1] - tmp0[2];
2013 
2014                         float tmp024b = tmp0[3] + tmp0[4];
2015                         float tmp135b = tmp0[3] - tmp0[4];
2016 
2017                         float tmp024c = tmp0[5] + tmp0[6];
2018                         float tmp135c = tmp0[5] - tmp0[6];
2019 
2020                         output0[0] = float32_to_bfloat16(bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32);
2021                         output0[2] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 4 + tmp024c * 8);
2022                         output0[4] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c);
2023 
2024                         output0[1] = float32_to_bfloat16(bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16);
2025                         output0[3] = float32_to_bfloat16(bias0 + tmp135a + tmp135b * 8 + tmp135c * 4);
2026                         output0[5] = float32_to_bfloat16(bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c);
2027 
2028                         output0 += outw;
2029                     }
2030                 }
2031             }
2032         }
2033     }
2034     // END transform output
2035 
2036     // cut result pad
2037     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
2038 }
2039