1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv3x3s1_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)15 static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
16 {
17     int w = bottom_blob.w;
18     int inch = bottom_blob.c;
19 
20     int outw = top_blob.w;
21     int outh = top_blob.h;
22     int outch = top_blob.c;
23 
24     const float* kernel = _kernel;
25     const float* bias = _bias;
26 
27     int nn_outch = outch >> 1;
28     int remain_outch_start = nn_outch << 1;
29 
30     #pragma omp parallel for num_threads(opt.num_threads)
31     for (int pp = 0; pp < nn_outch; pp++)
32     {
33         int p = pp * 2;
34 
35         Mat out0 = top_blob.channel(p);
36         Mat out1 = top_blob.channel(p + 1);
37 
38         const float bias0 = bias ? bias[p] : 0.f;
39         const float bias1 = bias ? bias[p + 1] : 0.f;
40 
41         out0.fill(bias0);
42         out1.fill(bias1);
43 
44         const float* k0 = kernel + p * inch * 9;
45         const float* k1 = kernel + (p + 1) * inch * 9;
46 
47         for (int q = 0; q < inch; q++)
48         {
49             float* outptr0 = out0;
50             float* outptr1 = out1;
51             float* outptr0n = outptr0 + outw;
52             float* outptr1n = outptr1 + outw;
53 
54             const float* img0 = bottom_blob.channel(q);
55 
56             const float* r0 = img0;
57             const float* r1 = img0 + w;
58             const float* r2 = img0 + w * 2;
59             const float* r3 = img0 + w * 3;
60 
61 #if __ARM_NEON
62             float32x4_t _k00 = vld1q_f32(k0);
63             float32x4_t _k03 = vld1q_f32(k0 + 3);
64             float32x4_t _k06 = vld1q_f32(k0 + 6);
65 
66             float32x4_t _k10 = vld1q_f32(k1);
67             float32x4_t _k13 = vld1q_f32(k1 + 3);
68             float32x4_t _k16 = vld1q_f32(k1 + 6);
69 #endif // __ARM_NEON
70 
71             int i = 0;
72 
73             for (; i + 1 < outh; i += 2)
74             {
75 #if __ARM_NEON
76                 int nn = outw >> 2;
77                 int remain = outw & 3;
78 #else
79                 int remain = outw;
80 #endif // __ARM_NEON
81 
82 #if __ARM_NEON
83 #if __aarch64__
84                 if (nn > 0)
85                 {
86                     asm volatile(
87                         "prfm   pldl1keep, [%5, #256]       \n"
88                         "ld1    {v8.4s, v9.4s}, [%5]        \n" // r0
89                         "add    %5, %5, #16                 \n"
90 
91                         "prfm   pldl1keep, [%8, #256]       \n"
92                         "ld1    {v14.4s, v15.4s}, [%8]      \n" // r3
93                         "add    %8, %8, #16                 \n"
94 
95                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
96                         "ext    v11.16b, v14.16b, v15.16b, #8 \n"
97 
98                         "0:                                 \n"
99 
100                         "prfm   pldl1keep, [%1, #128]       \n"
101                         "ld1    {v6.4s}, [%1]               \n" // _sum0
102 
103                         "prfm   pldl1keep, [%2, #128]       \n"
104                         "ld1    {v7.4s}, [%2]               \n" // _sum1
105 
106                         "fmla   v6.4s, v8.4s, %18.s[0]      \n"
107                         "fmla   v7.4s, v8.4s, %21.s[0]      \n"
108 
109                         "prfm   pldl1keep, [%3, #128]       \n"
110                         "ld1    {v12.4s}, [%3]              \n" // _sum0n
111 
112                         "prfm   pldl1keep, [%4, #128]       \n"
113                         "ld1    {v13.4s}, [%4]              \n" // _sum1n
114 
115                         "fmla   v12.4s, v14.4s, %20.s[0]    \n"
116                         "fmla   v13.4s, v14.4s, %23.s[0]    \n"
117 
118                         "ext    v8.16b, v8.16b, v9.16b, #8  \n"
119                         "ext    v9.16b, v14.16b, v15.16b, #4 \n"
120 
121                         "fmla   v6.4s, v10.4s, %18.s[1]     \n"
122                         "fmla   v7.4s, v10.4s, %21.s[1]     \n"
123                         "fmla   v12.4s, v11.4s, %20.s[2]    \n"
124                         "fmla   v13.4s, v11.4s, %23.s[2]    \n"
125 
126                         "prfm   pldl1keep, [%6, #256]       \n"
127                         "ld1    {v14.4s, v15.4s}, [%6]      \n" // r1
128                         "add    %6, %6, #16                 \n"
129 
130                         "fmla   v6.4s, v8.4s, %18.s[2]      \n"
131                         "fmla   v7.4s, v8.4s, %21.s[2]      \n"
132                         "fmla   v12.4s, v9.4s, %20.s[1]     \n"
133                         "fmla   v13.4s, v9.4s, %23.s[1]     \n"
134 
135                         "ext    v10.16b, v14.16b, v15.16b, #4 \n"
136 
137                         "fmla   v6.4s, v14.4s, %19.s[0]     \n"
138                         "fmla   v7.4s, v14.4s, %22.s[0]     \n"
139                         "fmla   v12.4s, v14.4s, %18.s[0]    \n"
140                         "fmla   v13.4s, v14.4s, %21.s[0]    \n"
141 
142                         "ext    v11.16b, v14.16b, v15.16b, #8 \n"
143 
144                         "fmla   v6.4s, v10.4s, %19.s[1]     \n"
145                         "fmla   v7.4s, v10.4s, %22.s[1]     \n"
146                         "fmla   v12.4s, v10.4s, %18.s[1]    \n"
147                         "fmla   v13.4s, v10.4s, %21.s[1]    \n"
148 
149                         "prfm   pldl1keep, [%7, #256]       \n"
150                         "ld1    {v8.4s, v9.4s}, [%7]        \n" // r2
151                         "add    %7, %7, #16                 \n"
152 
153                         "fmla   v6.4s, v11.4s, %19.s[2]     \n"
154                         "fmla   v7.4s, v11.4s, %22.s[2]     \n"
155                         "fmla   v12.4s, v11.4s, %18.s[2]    \n"
156                         "fmla   v13.4s, v11.4s, %21.s[2]    \n"
157 
158                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
159 
160                         "fmla   v6.4s, v8.4s, %20.s[0]      \n"
161                         "fmla   v7.4s, v8.4s, %23.s[0]      \n"
162                         "fmla   v12.4s, v8.4s, %19.s[0]     \n"
163                         "fmla   v13.4s, v8.4s, %22.s[0]     \n"
164 
165                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
166 
167                         "fmla   v6.4s, v10.4s, %20.s[1]     \n"
168                         "fmla   v7.4s, v10.4s, %23.s[1]     \n"
169                         "fmla   v12.4s, v10.4s, %19.s[1]    \n"
170                         "fmla   v13.4s, v10.4s, %22.s[1]    \n"
171 
172                         "prfm   pldl1keep, [%5, #256]       \n"
173                         "ld1    {v8.4s, v9.4s}, [%5]        \n" // r0
174                         "add    %5, %5, #16                 \n"
175 
176                         "fmla   v6.4s, v11.4s, %20.s[2]     \n"
177                         "fmla   v7.4s, v11.4s, %23.s[2]     \n"
178                         "fmla   v12.4s, v11.4s, %19.s[2]    \n"
179                         "fmla   v13.4s, v11.4s, %22.s[2]    \n"
180 
181                         "prfm   pldl1keep, [%8, #256]       \n"
182                         "ld1    {v14.4s, v15.4s}, [%8]      \n" // r3
183                         "add    %8, %8, #16                 \n"
184 
185                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
186 
187                         "st1    {v6.4s}, [%1], #16          \n"
188                         "st1    {v7.4s}, [%2], #16          \n"
189 
190                         "ext    v11.16b, v14.16b, v15.16b, #8 \n"
191 
192                         "st1    {v12.4s}, [%3], #16         \n"
193                         "st1    {v13.4s}, [%4], #16         \n"
194 
195                         "subs   %w0, %w0, #1                \n"
196                         "bne    0b                          \n"
197 
198                         "sub    %5, %5, #16                 \n"
199                         "sub    %8, %8, #16                 \n"
200                         : "=r"(nn),       // %0
201                         "=r"(outptr0),  // %1
202                         "=r"(outptr1),  // %2
203                         "=r"(outptr0n), // %3
204                         "=r"(outptr1n), // %4
205                         "=r"(r0),       // %5
206                         "=r"(r1),       // %6
207                         "=r"(r2),       // %7
208                         "=r"(r3)        // %8
209                         : "0"(nn),
210                         "1"(outptr0),
211                         "2"(outptr1),
212                         "3"(outptr0n),
213                         "4"(outptr1n),
214                         "5"(r0),
215                         "6"(r1),
216                         "7"(r2),
217                         "8"(r3),
218                         "w"(_k00), // %18
219                         "w"(_k03), // %19
220                         "w"(_k06), // %20
221                         "w"(_k10), // %21
222                         "w"(_k13), // %22
223                         "w"(_k16)  // %23
224                         : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
225                 }
226 #else
227                 if (nn > 0)
228                 {
229                     asm volatile(
230 
231                         "pld        [%5, #192]          \n"
232                         "vld1.f32   {d16-d18}, [%5 :64] \n" // r0
233                         "add        %5, #16             \n"
234 
235                         "pld        [%8, #192]          \n"
236                         "vld1.f32   {d28-d30}, [%8]     \n" // r3
237                         "add        %8, #16             \n"
238 
239                         "vext.32    q10, q8, q9, #1     \n"
240                         "vext.32    q11, q14, q15, #2   \n"
241 
242                         "0:                             \n"
243 
244                         "pld        [%1, #128]          \n"
245                         "vld1.f32   {d12-d13}, [%1 :64] \n" // _sum0
246 
247                         "pld        [%2, #128]          \n"
248                         "vld1.f32   {d14-d15}, [%2 :64] \n" // _sum1
249 
250                         "vmla.f32   q6, q8, %e18[0]     \n"
251                         "vmla.f32   q7, q8, %e21[0]     \n"
252 
253                         "pld        [%3, #128]          \n"
254                         "vld1.f32   {d24-d25}, [%3]     \n" // _sum0n
255 
256                         "pld        [%4, #128]          \n"
257                         "vld1.f32   {d26-d27}, [%4]     \n" // _sum1n
258 
259                         "vmla.f32   q12, q14, %e20[0]   \n"
260                         "vmla.f32   q13, q14, %e23[0]   \n"
261 
262                         "vext.32    q8, q8, q9, #2      \n"
263                         "vext.32    q9, q14, q15, #1    \n"
264 
265                         "vmla.f32   q6, q10, %e18[1]    \n"
266                         "vmla.f32   q7, q10, %e21[1]    \n"
267                         "vmla.f32   q12, q11, %f20[0]   \n"
268                         "vmla.f32   q13, q11, %f23[0]   \n"
269 
270                         "pld        [%6, #192]          \n"
271                         "vld1.f32   {d28-d30}, [%6]     \n" // r1
272                         "add        %6, #16             \n"
273 
274                         "vmla.f32   q6, q8, %f18[0]     \n"
275                         "vmla.f32   q7, q8, %f21[0]     \n"
276                         "vmla.f32   q12, q9, %e20[1]    \n"
277                         "vmla.f32   q13, q9, %e23[1]    \n"
278 
279                         "vext.32    q10, q14, q15, #1   \n"
280 
281                         "vmla.f32   q6, q14, %e19[0]    \n"
282                         "vmla.f32   q7, q14, %e22[0]    \n"
283                         "vmla.f32   q12, q14, %e18[0]   \n"
284                         "vmla.f32   q13, q14, %e21[0]   \n"
285 
286                         "vext.32    q11, q14, q15, #2   \n"
287 
288                         "vmla.f32   q6, q10, %e19[1]    \n"
289                         "vmla.f32   q7, q10, %e22[1]    \n"
290                         "vmla.f32   q12, q10, %e18[1]   \n"
291                         "vmla.f32   q13, q10, %e21[1]   \n"
292 
293                         "pld        [%7, #192]          \n"
294                         "vld1.f32   {d16-d18}, [%7 :64] \n" // r2
295                         "add        %7, #16             \n"
296 
297                         "vmla.f32   q6, q11, %f19[0]    \n"
298                         "vmla.f32   q7, q11, %f22[0]    \n"
299                         "vmla.f32   q12, q11, %f18[0]   \n"
300                         "vmla.f32   q13, q11, %f21[0]   \n"
301 
302                         "vext.32    q10, q8, q9, #1     \n"
303 
304                         "vmla.f32   q6, q8, %e20[0]     \n"
305                         "vmla.f32   q7, q8, %e23[0]     \n"
306                         "vmla.f32   q12, q8, %e19[0]    \n"
307                         "vmla.f32   q13, q8, %e22[0]    \n"
308 
309                         "vext.32    q11, q8, q9, #2     \n"
310 
311                         "vmla.f32   q6, q10, %e20[1]    \n"
312                         "vmla.f32   q7, q10, %e23[1]    \n"
313                         "vmla.f32   q12, q10, %e19[1]   \n"
314                         "vmla.f32   q13, q10, %e22[1]   \n"
315 
316                         "pld        [%5, #192]          \n"
317                         "vld1.f32   {d16-d18}, [%5 :64] \n" // r0
318                         "add        %5, #16             \n"
319 
320                         "vmla.f32   q6, q11, %f20[0]    \n"
321                         "vmla.f32   q7, q11, %f23[0]    \n"
322                         "vmla.f32   q12, q11, %f19[0]   \n"
323                         "vmla.f32   q13, q11, %f22[0]   \n"
324 
325                         "pld        [%8, #192]          \n"
326                         "vld1.f32   {d28-d30}, [%8]     \n" // r3
327                         "add        %8, #16             \n"
328 
329                         "vext.32    q10, q8, q9, #1     \n"
330 
331                         "vst1.f32   {d12-d13}, [%1 : 64]!\n"
332                         "vst1.f32   {d14-d15}, [%2 : 64]!\n"
333 
334                         "vext.32    q11, q14, q15, #2   \n"
335 
336                         "vst1.f32   {d24-d25}, [%3]!    \n"
337                         "vst1.f32   {d26-d27}, [%4]!    \n"
338 
339                         "subs       %0, #1              \n"
340                         "bne        0b                  \n"
341 
342                         "sub        %5, #16             \n"
343                         "sub        %8, #16             \n"
344                         : "=r"(nn),       // %0
345                         "=r"(outptr0),  // %1
346                         "=r"(outptr1),  // %2
347                         "=r"(outptr0n), // %3
348                         "=r"(outptr1n), // %4
349                         "=r"(r0),       // %5
350                         "=r"(r1),       // %6
351                         "=r"(r2),       // %7
352                         "=r"(r3)        // %8
353                         : "0"(nn),
354                         "1"(outptr0),
355                         "2"(outptr1),
356                         "3"(outptr0n),
357                         "4"(outptr1n),
358                         "5"(r0),
359                         "6"(r1),
360                         "7"(r2),
361                         "8"(r3),
362                         "w"(_k00), // %18
363                         "w"(_k03), // %19
364                         "w"(_k06), // %20
365                         "w"(_k10), // %21
366                         "w"(_k13), // %22
367                         "w"(_k16)  // %23
368                         : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
369                 }
370 #endif // __aarch64__
371 #endif // __ARM_NEON
372                 for (; remain > 0; remain--)
373                 {
374 #if __ARM_NEON
375                     float32x4_t _r00 = vld1q_f32(r0);
376                     float32x4_t _r10 = vld1q_f32(r1);
377                     float32x4_t _r20 = vld1q_f32(r2);
378                     float32x4_t _r30 = vld1q_f32(r3);
379 
380                     float32x4_t _sum0 = vmulq_f32(_r00, _k00);
381                     float32x4_t _sum1 = vmulq_f32(_r00, _k10);
382                     _sum0 = vmlaq_f32(_sum0, _r10, _k03);
383                     _sum1 = vmlaq_f32(_sum1, _r10, _k13);
384                     _sum0 = vmlaq_f32(_sum0, _r20, _k06);
385                     _sum1 = vmlaq_f32(_sum1, _r20, _k16);
386 
387                     float32x4_t _sum0n = vmulq_f32(_r10, _k00);
388                     float32x4_t _sum1n = vmulq_f32(_r10, _k10);
389                     _sum0n = vmlaq_f32(_sum0n, _r20, _k03);
390                     _sum1n = vmlaq_f32(_sum1n, _r20, _k13);
391                     _sum0n = vmlaq_f32(_sum0n, _r30, _k06);
392                     _sum1n = vmlaq_f32(_sum1n, _r30, _k16);
393 
394                     _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
395                     _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
396                     _sum0n = vsetq_lane_f32(*outptr0n, _sum0n, 3);
397                     _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3);
398 #if __aarch64__
399                     *outptr0 = vaddvq_f32(_sum0);
400                     *outptr1 = vaddvq_f32(_sum1);
401                     *outptr0n = vaddvq_f32(_sum0n);
402                     *outptr1n = vaddvq_f32(_sum1n);
403 #else
404                     float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
405                     float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
406                     float32x2_t _ss0n = vadd_f32(vget_low_f32(_sum0n), vget_high_f32(_sum0n));
407                     float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n));
408 
409                     float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
410                     float32x2_t _ss01n = vpadd_f32(_ss0n, _ss1n);
411 
412                     *outptr0 = vget_lane_f32(_ss01, 0);
413                     *outptr1 = vget_lane_f32(_ss01, 1);
414                     *outptr0n = vget_lane_f32(_ss01n, 0);
415                     *outptr1n = vget_lane_f32(_ss01n, 1);
416 #endif // __aarch64__
417 #else
418                     float sum0 = 0.f;
419                     float sum0n = 0.f;
420                     float sum1 = 0.f;
421                     float sum1n = 0.f;
422 
423                     sum0 += r0[0] * k0[0];
424                     sum0 += r0[1] * k0[1];
425                     sum0 += r0[2] * k0[2];
426                     sum0 += r1[0] * k0[3];
427                     sum0 += r1[1] * k0[4];
428                     sum0 += r1[2] * k0[5];
429                     sum0 += r2[0] * k0[6];
430                     sum0 += r2[1] * k0[7];
431                     sum0 += r2[2] * k0[8];
432 
433                     sum1 += r0[0] * k1[0];
434                     sum1 += r0[1] * k1[1];
435                     sum1 += r0[2] * k1[2];
436                     sum1 += r1[0] * k1[3];
437                     sum1 += r1[1] * k1[4];
438                     sum1 += r1[2] * k1[5];
439                     sum1 += r2[0] * k1[6];
440                     sum1 += r2[1] * k1[7];
441                     sum1 += r2[2] * k1[8];
442 
443                     sum0n += r1[0] * k0[0];
444                     sum0n += r1[1] * k0[1];
445                     sum0n += r1[2] * k0[2];
446                     sum0n += r2[0] * k0[3];
447                     sum0n += r2[1] * k0[4];
448                     sum0n += r2[2] * k0[5];
449                     sum0n += r3[0] * k0[6];
450                     sum0n += r3[1] * k0[7];
451                     sum0n += r3[2] * k0[8];
452 
453                     sum1n += r1[0] * k1[0];
454                     sum1n += r1[1] * k1[1];
455                     sum1n += r1[2] * k1[2];
456                     sum1n += r2[0] * k1[3];
457                     sum1n += r2[1] * k1[4];
458                     sum1n += r2[2] * k1[5];
459                     sum1n += r3[0] * k1[6];
460                     sum1n += r3[1] * k1[7];
461                     sum1n += r3[2] * k1[8];
462 
463                     *outptr0 += sum0;
464                     *outptr1 += sum1;
465                     *outptr0n += sum0n;
466                     *outptr1n += sum1n;
467 #endif // __ARM_NEON
468                     r0++;
469                     r1++;
470                     r2++;
471                     r3++;
472                     outptr0++;
473                     outptr1++;
474                     outptr0n++;
475                     outptr1n++;
476                 }
477 
478                 r0 += 2 + w;
479                 r1 += 2 + w;
480                 r2 += 2 + w;
481                 r3 += 2 + w;
482 
483                 outptr0 += outw;
484                 outptr1 += outw;
485                 outptr0n += outw;
486                 outptr1n += outw;
487             }
488 
489             for (; i < outh; i++)
490             {
491 #if __ARM_NEON
492                 int nn = outw >> 2;
493                 int remain = outw & 3;
494 #else
495                 int remain = outw;
496 #endif // __ARM_NEON
497 
498 #if __ARM_NEON
499 #if __aarch64__
500                 if (nn > 0)
501                 {
502                     asm volatile(
503                         "0:                                 \n"
504 
505                         "prfm   pldl1keep, [%3, #256]       \n"
506                         "ld1    {v8.4s, v9.4s}, [%3]        \n" // r0
507                         "add    %3, %3, #16                 \n"
508 
509                         "prfm   pldl1keep, [%1, #128]       \n"
510                         "ld1    {v6.4s}, [%1]               \n" // _sum0
511 
512                         "prfm   pldl1keep, [%2, #128]       \n"
513                         "ld1    {v7.4s}, [%2]               \n" // _sum1
514 
515                         "fmul   v14.4s, v8.4s, %12.s[0]     \n"
516                         "fmul   v15.4s, v8.4s, %15.s[0]     \n"
517 
518                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
519                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
520 
521                         "fmla   v6.4s, v10.4s, %12.s[1]     \n"
522                         "fmla   v7.4s, v10.4s, %15.s[1]     \n"
523 
524                         "prfm   pldl1keep, [%4, #256]       \n"
525                         "ld1    {v8.4s, v9.4s}, [%4]        \n" // r1
526                         "add    %4, %4, #16                 \n"
527 
528                         "fmla   v14.4s, v11.4s, %12.s[2]    \n"
529                         "fmla   v15.4s, v11.4s, %15.s[2]    \n"
530 
531                         "fmla   v6.4s, v8.4s, %13.s[0]      \n"
532                         "fmla   v7.4s, v8.4s, %16.s[0]      \n"
533 
534                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
535                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
536 
537                         "fmla   v14.4s, v10.4s, %13.s[1]    \n"
538                         "fmla   v15.4s, v10.4s, %16.s[1]    \n"
539 
540                         "prfm   pldl1keep, [%5, #256]       \n"
541                         "ld1    {v8.4s, v9.4s}, [%5]        \n" // r2
542                         "add    %5, %5, #16                 \n"
543 
544                         "fmla   v6.4s, v11.4s, %13.s[2]     \n"
545                         "fmla   v7.4s, v11.4s, %16.s[2]     \n"
546 
547                         "fmla   v14.4s, v8.4s, %14.s[0]     \n"
548                         "fmla   v15.4s, v8.4s, %17.s[0]     \n"
549 
550                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
551                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
552 
553                         "fmla   v6.4s, v10.4s, %14.s[1]     \n"
554                         "fmla   v7.4s, v10.4s, %17.s[1]     \n"
555 
556                         "fmla   v14.4s, v11.4s, %14.s[2]    \n"
557                         "fmla   v15.4s, v11.4s, %17.s[2]    \n"
558 
559                         "fadd   v6.4s, v6.4s, v14.4s        \n"
560                         "fadd   v7.4s, v7.4s, v15.4s        \n"
561 
562                         "st1    {v6.4s}, [%1], #16          \n"
563                         "st1    {v7.4s}, [%2], #16          \n"
564 
565                         "subs   %w0, %w0, #1                \n"
566                         "bne    0b                          \n"
567 
568                         : "=r"(nn),      // %0
569                         "=r"(outptr0), // %1
570                         "=r"(outptr1), // %2
571                         "=r"(r0),      // %3
572                         "=r"(r1),      // %4
573                         "=r"(r2)       // %5
574                         : "0"(nn),
575                         "1"(outptr0),
576                         "2"(outptr1),
577                         "3"(r0),
578                         "4"(r1),
579                         "5"(r2),
580                         "w"(_k00), // %12
581                         "w"(_k03), // %13
582                         "w"(_k06), // %14
583                         "w"(_k10), // %15
584                         "w"(_k13), // %16
585                         "w"(_k16)  // %17
586                         : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
587                 }
588 #else
589                 if (nn > 0)
590                 {
591                     asm volatile(
592                         "0:                             \n"
593 
594                         "pld        [%3, #192]          \n"
595                         "vld1.f32   {d16-d18}, [%3]     \n" // r0
596                         "add        %3, #16             \n"
597 
598                         "pld        [%1, #128]          \n"
599                         "vld1.f32   {d12-d13}, [%1]     \n" // _sum0
600 
601                         "pld        [%2, #128]          \n"
602                         "vld1.f32   {d14-d15}, [%2]     \n" // _sum1
603 
604                         "vmul.f32   q14, q8, %e12[0]    \n"
605                         "vmul.f32   q15, q8, %e15[0]    \n"
606 
607                         "vext.32    q10, q8, q9, #1     \n"
608                         "vext.32    q11, q8, q9, #2     \n"
609 
610                         "vmla.f32   q6, q10, %e12[1]    \n"
611                         "vmla.f32   q7, q10, %e15[1]    \n"
612 
613                         "pld        [%4, #192]          \n"
614                         "vld1.f32   {d16-d18}, [%4]     \n" // r1
615                         "add        %4, #16             \n"
616 
617                         "vmla.f32   q14, q11, %f12[0]   \n"
618                         "vmla.f32   q15, q11, %f15[0]   \n"
619 
620                         "vmla.f32   q6, q8, %e13[0]     \n"
621                         "vmla.f32   q7, q8, %e16[0]     \n"
622 
623                         "vext.32    q10, q8, q9, #1     \n"
624                         "vext.32    q11, q8, q9, #2     \n"
625 
626                         "vmla.f32   q14, q10, %e13[1]   \n"
627                         "vmla.f32   q15, q10, %e16[1]   \n"
628 
629                         "pld        [%5, #192]          \n"
630                         "vld1.f32   {d16-d18}, [%5]     \n" // r2
631                         "add        %5, #16             \n"
632 
633                         "vmla.f32   q6, q11, %f13[0]    \n"
634                         "vmla.f32   q7, q11, %f16[0]    \n"
635 
636                         "vmla.f32   q14, q8, %e14[0]    \n"
637                         "vmla.f32   q15, q8, %e17[0]    \n"
638 
639                         "vext.32    q10, q8, q9, #1     \n"
640                         "vext.32    q11, q8, q9, #2     \n"
641 
642                         "vmla.f32   q6, q10, %e14[1]    \n"
643                         "vmla.f32   q7, q10, %e17[1]    \n"
644 
645                         "vmla.f32   q14, q11, %f14[0]   \n"
646                         "vmla.f32   q15, q11, %f17[0]   \n"
647 
648                         "vadd.f32   q6, q6, q14         \n"
649                         "vadd.f32   q7, q7, q15         \n"
650 
651                         "vst1.f32   {d12-d13}, [%1]!    \n"
652 
653                         "vst1.f32   {d14-d15}, [%2]!    \n"
654 
655                         "subs       %0, #1              \n"
656                         "bne        0b                  \n"
657 
658                         : "=r"(nn),      // %0
659                         "=r"(outptr0), // %1
660                         "=r"(outptr1), // %2
661                         "=r"(r0),      // %3
662                         "=r"(r1),      // %4
663                         "=r"(r2)       // %5
664                         : "0"(nn),
665                         "1"(outptr0),
666                         "2"(outptr1),
667                         "3"(r0),
668                         "4"(r1),
669                         "5"(r2),
670                         "w"(_k00), // %12
671                         "w"(_k03), // %13
672                         "w"(_k06), // %14
673                         "w"(_k10), // %15
674                         "w"(_k13), // %16
675                         "w"(_k16)  // %17
676                         : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
677                 }
678 #endif // __aarch64__
679 #endif // __ARM_NEON
680                 for (; remain > 0; remain--)
681                 {
682 #if __ARM_NEON
683                     float32x4_t _r00 = vld1q_f32(r0);
684                     float32x4_t _r10 = vld1q_f32(r1);
685                     float32x4_t _r20 = vld1q_f32(r2);
686 
687                     float32x4_t _sum0 = vmulq_f32(_r00, _k00);
688                     float32x4_t _sum1 = vmulq_f32(_r00, _k10);
689                     _sum0 = vmlaq_f32(_sum0, _r10, _k03);
690                     _sum1 = vmlaq_f32(_sum1, _r10, _k13);
691                     _sum0 = vmlaq_f32(_sum0, _r20, _k06);
692                     _sum1 = vmlaq_f32(_sum1, _r20, _k16);
693 
694                     _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
695                     _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
696 #if __aarch64__
697                     *outptr0 = vaddvq_f32(_sum0);
698                     *outptr1 = vaddvq_f32(_sum1);
699 #else
700                     float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
701                     float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
702                     float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
703 
704                     *outptr0 = vget_lane_f32(_ss01, 0);
705                     *outptr1 = vget_lane_f32(_ss01, 1);
706 #endif // __aarch64__
707 #else
708                     float sum0 = 0.f;
709                     float sum1 = 0.f;
710 
711                     sum0 += r0[0] * k0[0];
712                     sum0 += r0[1] * k0[1];
713                     sum0 += r0[2] * k0[2];
714                     sum0 += r1[0] * k0[3];
715                     sum0 += r1[1] * k0[4];
716                     sum0 += r1[2] * k0[5];
717                     sum0 += r2[0] * k0[6];
718                     sum0 += r2[1] * k0[7];
719                     sum0 += r2[2] * k0[8];
720 
721                     sum1 += r0[0] * k1[0];
722                     sum1 += r0[1] * k1[1];
723                     sum1 += r0[2] * k1[2];
724                     sum1 += r1[0] * k1[3];
725                     sum1 += r1[1] * k1[4];
726                     sum1 += r1[2] * k1[5];
727                     sum1 += r2[0] * k1[6];
728                     sum1 += r2[1] * k1[7];
729                     sum1 += r2[2] * k1[8];
730 
731                     *outptr0 += sum0;
732                     *outptr1 += sum1;
733 #endif // __ARM_NEON
734                     r0++;
735                     r1++;
736                     r2++;
737                     outptr0++;
738                     outptr1++;
739                 }
740 
741                 r0 += 2;
742                 r1 += 2;
743                 r2 += 2;
744             }
745 
746             k0 += 9;
747             k1 += 9;
748         }
749     }
750 
751     #pragma omp parallel for num_threads(opt.num_threads)
752     for (int p = remain_outch_start; p < outch; p++)
753     {
754         Mat out = top_blob.channel(p);
755 
756         const float bias0 = bias ? bias[p] : 0.f;
757 
758         out.fill(bias0);
759 
760         const float* kernel0 = kernel + p * inch * 9;
761 
762         for (int q = 0; q < inch; q++)
763         {
764             float* outptr = out;
765             float* outptr2 = outptr + outw;
766 
767             const float* img0 = bottom_blob.channel(q);
768 
769             const float* r0 = img0;
770             const float* r1 = img0 + w;
771             const float* r2 = img0 + w * 2;
772             const float* r3 = img0 + w * 3;
773 
774 #if __ARM_NEON
775             float32x4_t _k0123 = vld1q_f32(kernel0);
776             float32x4_t _k3456 = vld1q_f32(kernel0 + 3);
777             float32x4_t _k6789 = vld1q_f32(kernel0 + 6);
778 #else
779             const float* k0 = kernel0;
780             const float* k1 = kernel0 + 3;
781             const float* k2 = kernel0 + 6;
782 #endif // __ARM_NEON
783 
784             int i = 0;
785 
786             for (; i + 1 < outh; i += 2)
787             {
788 #if __ARM_NEON
789                 int nn = outw >> 2;
790                 int remain = outw & 3;
791 #else
792                 int remain = outw;
793 #endif // __ARM_NEON
794 
795 #if __ARM_NEON
796 #if __aarch64__
797                 if (nn > 0)
798                 {
799                     asm volatile(
800                         "prfm   pldl1keep, [%3, #256]       \n"
801                         "ld1    {v9.4s, v10.4s}, [%3]       \n" // r0
802                         "add    %3, %3, #16                 \n"
803 
804                         "ext    v11.16b, v9.16b, v10.16b, #4 \n"
805                         "ext    v12.16b, v9.16b, v10.16b, #8 \n"
806 
807                         "0:                                 \n"
808 
809                         "prfm   pldl1keep, [%1, #128]       \n"
810                         "ld1    {v7.4s}, [%1]               \n" // _sum
811 
812                         "fmla   v7.4s, v9.4s, %14.s[0]      \n"
813                         "fmul   v6.4s, v11.4s, %14.s[1]     \n"
814                         "fmul   v13.4s, v12.4s, %14.s[2]    \n"
815 
816                         "prfm   pldl1keep, [%4, #256]       \n"
817                         "ld1    {v9.4s, v10.4s}, [%4]       \n" // r1
818                         "add    %4, %4, #16                 \n"
819 
820                         "fmla   v7.4s, v9.4s, %15.s[0]      \n"
821 
822                         "ext    v11.16b, v9.16b, v10.16b, #4 \n"
823                         "ext    v12.16b, v9.16b, v10.16b, #8 \n"
824 
825                         "fmla   v6.4s, v11.4s, %15.s[1]     \n"
826                         "fmla   v13.4s, v12.4s, %15.s[2]    \n"
827 
828                         "prfm   pldl1keep, [%2, #128]       \n"
829                         "ld1    {v8.4s}, [%2]               \n" // _sum2
830 
831                         "fmla   v8.4s, v9.4s, %14.s[0]      \n"
832                         "fmul   v14.4s, v11.4s, %14.s[1]    \n"
833                         "fmul   v15.4s, v12.4s, %14.s[2]    \n"
834 
835                         "prfm   pldl1keep, [%5, #256]       \n"
836                         "ld1    {v9.4s, v10.4s}, [%5]       \n" // r2
837                         "add    %5, %5, #16                 \n"
838 
839                         "fmla   v7.4s, v9.4s, %16.s[0]      \n"
840 
841                         "ext    v11.16b, v9.16b, v10.16b, #4 \n"
842                         "ext    v12.16b, v9.16b, v10.16b, #8 \n"
843 
844                         "fmla   v6.4s, v11.4s, %16.s[1]     \n"
845                         "fmla   v13.4s, v12.4s, %16.s[2]    \n"
846 
847                         "fmla   v8.4s, v9.4s, %15.s[0]      \n"
848                         "fmla   v14.4s, v11.4s, %15.s[1]    \n"
849                         "fmla   v15.4s, v12.4s, %15.s[2]    \n"
850 
851                         "prfm   pldl1keep, [%6, #256]       \n"
852                         "ld1    {v9.4s, v10.4s}, [%6]       \n" // r3
853                         "add    %6, %6, #16                 \n"
854 
855                         "fmla   v8.4s, v9.4s, %16.s[0]      \n"
856 
857                         "ext    v11.16b, v9.16b, v10.16b, #4 \n"
858                         "ext    v12.16b, v9.16b, v10.16b, #8 \n"
859 
860                         "fmla   v14.4s, v11.4s, %16.s[1]    \n"
861                         "fmla   v15.4s, v12.4s, %16.s[2]    \n"
862 
863                         "fadd   v7.4s, v7.4s, v6.4s         \n"
864 
865                         "prfm   pldl1keep, [%3, #256]       \n"
866                         "ld1    {v9.4s, v10.4s}, [%3]       \n" // r0
867 
868                         "fadd   v8.4s, v8.4s, v14.4s        \n"
869                         "fadd   v7.4s, v7.4s, v13.4s        \n"
870                         "fadd   v8.4s, v8.4s, v15.4s        \n"
871 
872                         "ext    v11.16b, v9.16b, v10.16b, #4 \n"
873                         "ext    v12.16b, v9.16b, v10.16b, #8 \n"
874 
875                         "add    %3, %3, #16                 \n"
876 
877                         "st1    {v7.4s}, [%1], #16          \n"
878                         "st1    {v8.4s}, [%2], #16          \n"
879 
880                         "subs   %w0, %w0, #1                \n"
881                         "bne    0b                          \n"
882 
883                         "sub    %3, %3, #16                 \n"
884                         : "=r"(nn),      // %0
885                         "=r"(outptr),  // %1
886                         "=r"(outptr2), // %2
887                         "=r"(r0),      // %3
888                         "=r"(r1),      // %4
889                         "=r"(r2),      // %5
890                         "=r"(r3)       // %6
891                         : "0"(nn),
892                         "1"(outptr),
893                         "2"(outptr2),
894                         "3"(r0),
895                         "4"(r1),
896                         "5"(r2),
897                         "6"(r3),
898                         "w"(_k0123), // %14
899                         "w"(_k3456), // %15
900                         "w"(_k6789)  // %16
901                         : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
902                 }
903 #else
904                 if (nn > 0)
905                 {
906                     asm volatile(
907                         "pld        [%3, #192]          \n"
908                         "vld1.f32   {d18-d20}, [%3 :64] \n" // r0
909                         "add        %3, #16             \n"
910 
911                         "vext.32    q11, q9, q10, #1    \n"
912                         "vext.32    q12, q9, q10, #2    \n"
913 
914                         "0:                             \n"
915 
916                         "pld        [%1, #128]          \n"
917                         "vld1.f32   {d14-d15}, [%1 :64] \n" // _sum
918 
919                         "vmla.f32   q7, q9, %e14[0]     \n"
920                         "vmul.f32   q6, q11, %e14[1]    \n"
921                         "vmul.f32   q13, q12, %f14[0]   \n"
922 
923                         "pld        [%4, #192]          \n"
924                         "vld1.f32   {d18-d20}, [%4]     \n" // r1
925                         "add        %4, #16             \n"
926 
927                         "vmla.f32   q7, q9, %e15[0]     \n"
928 
929                         "vext.32    q11, q9, q10, #1    \n"
930                         "vext.32    q12, q9, q10, #2    \n"
931 
932                         "vmla.f32   q6, q11, %e15[1]    \n"
933                         "vmla.f32   q13, q12, %f15[0]   \n"
934 
935                         "pld        [%2, #128]          \n"
936                         "vld1.f32   {d16-d17}, [%2]     \n" // _sum2
937 
938                         "vmla.f32   q8, q9, %e14[0]     \n"
939                         "vmul.f32   q14, q11, %e14[1]   \n"
940                         "vmul.f32   q15, q12, %f14[0]   \n"
941 
942                         "pld        [%5, #192]          \n"
943                         "vld1.f32   {d18-d20}, [%5 :64] \n" // r2
944                         "add        %5, #16             \n"
945 
946                         "vmla.f32   q7, q9, %e16[0]     \n"
947 
948                         "vext.32    q11, q9, q10, #1    \n"
949                         "vext.32    q12, q9, q10, #2    \n"
950 
951                         "vmla.f32   q6, q11, %e16[1]    \n"
952                         "vmla.f32   q13, q12, %f16[0]   \n"
953 
954                         "vmla.f32   q8, q9, %e15[0]     \n"
955                         "vmla.f32   q14, q11, %e15[1]   \n"
956                         "vmla.f32   q15, q12, %f15[0]   \n"
957 
958                         "pld        [%6, #192]          \n"
959                         "vld1.f32   {d18-d20}, [%6]     \n" // r3
960                         "add        %6, #16             \n"
961 
962                         "vmla.f32   q8, q9, %e16[0]     \n"
963 
964                         "vext.32    q11, q9, q10, #1    \n"
965                         "vext.32    q12, q9, q10, #2    \n"
966 
967                         "vmla.f32   q14, q11, %e16[1]   \n"
968                         "vmla.f32   q15, q12, %f16[0]   \n"
969 
970                         "vadd.f32   q7, q7, q6          \n"
971 
972                         "pld        [%3, #192]          \n"
973                         "vld1.f32   {d18-d20}, [%3 :64] \n" // r0
974 
975                         "vadd.f32   q8, q8, q14         \n"
976                         "vadd.f32   q7, q7, q13         \n"
977                         "vadd.f32   q8, q8, q15         \n"
978 
979                         "vext.32    q11, q9, q10, #1    \n"
980                         "vext.32    q12, q9, q10, #2    \n"
981 
982                         "add        %3, #16             \n"
983 
984                         "vst1.f32   {d14-d15}, [%1]!    \n"
985                         "vst1.f32   {d16-d17}, [%2]!    \n"
986 
987                         "subs       %0, #1              \n"
988                         "bne        0b                  \n"
989 
990                         "sub        %3, #16             \n"
991                         : "=r"(nn),      // %0
992                         "=r"(outptr),  // %1
993                         "=r"(outptr2), // %2
994                         "=r"(r0),      // %3
995                         "=r"(r1),      // %4
996                         "=r"(r2),      // %5
997                         "=r"(r3)       // %6
998                         : "0"(nn),
999                         "1"(outptr),
1000                         "2"(outptr2),
1001                         "3"(r0),
1002                         "4"(r1),
1003                         "5"(r2),
1004                         "6"(r3),
1005                         "w"(_k0123), // %14
1006                         "w"(_k3456), // %15
1007                         "w"(_k6789)  // %16
1008                         : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1009                 }
1010 #endif // __aarch64__
1011 #endif // __ARM_NEON
1012                 for (; remain > 0; remain--)
1013                 {
1014 #if __ARM_NEON
1015                     float32x4_t _r00 = vld1q_f32(r0);
1016                     float32x4_t _r10 = vld1q_f32(r1);
1017                     float32x4_t _r20 = vld1q_f32(r2);
1018                     float32x4_t _r30 = vld1q_f32(r3);
1019 
1020                     float32x4_t _sum = vmulq_f32(_r00, _k0123);
1021                     _sum = vmlaq_f32(_sum, _r10, _k3456);
1022                     _sum = vmlaq_f32(_sum, _r20, _k6789);
1023 
1024                     float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
1025                     _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
1026                     _sum2 = vmlaq_f32(_sum2, _r30, _k6789);
1027 
1028                     _sum = vsetq_lane_f32(*outptr, _sum, 3);
1029                     _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);
1030 
1031 #if __aarch64__
1032                     *outptr = vaddvq_f32(_sum);
1033                     *outptr2 = vaddvq_f32(_sum2);
1034 #else
1035                     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
1036                     float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));
1037 
1038                     float32x2_t _sss2 = vpadd_f32(_ss, _ss2);
1039 
1040                     *outptr = vget_lane_f32(_sss2, 0);
1041                     *outptr2 = vget_lane_f32(_sss2, 1);
1042 #endif // __aarch64__
1043 #else
1044                     float sum = 0;
1045                     float sum2 = 0;
1046 
1047                     sum += r0[0] * k0[0];
1048                     sum += r0[1] * k0[1];
1049                     sum += r0[2] * k0[2];
1050                     sum += r1[0] * k1[0];
1051                     sum += r1[1] * k1[1];
1052                     sum += r1[2] * k1[2];
1053                     sum += r2[0] * k2[0];
1054                     sum += r2[1] * k2[1];
1055                     sum += r2[2] * k2[2];
1056 
1057                     sum2 += r1[0] * k0[0];
1058                     sum2 += r1[1] * k0[1];
1059                     sum2 += r1[2] * k0[2];
1060                     sum2 += r2[0] * k1[0];
1061                     sum2 += r2[1] * k1[1];
1062                     sum2 += r2[2] * k1[2];
1063                     sum2 += r3[0] * k2[0];
1064                     sum2 += r3[1] * k2[1];
1065                     sum2 += r3[2] * k2[2];
1066 
1067                     *outptr += sum;
1068                     *outptr2 += sum2;
1069 #endif
1070                     r0++;
1071                     r1++;
1072                     r2++;
1073                     r3++;
1074                     outptr++;
1075                     outptr2++;
1076                 }
1077 
1078                 r0 += 2 + w;
1079                 r1 += 2 + w;
1080                 r2 += 2 + w;
1081                 r3 += 2 + w;
1082 
1083                 outptr += outw;
1084                 outptr2 += outw;
1085             }
1086 
1087             for (; i < outh; i++)
1088             {
1089 #if __ARM_NEON
1090                 int nn = outw >> 2;
1091                 int remain = outw & 3;
1092 #else
1093                 int remain = outw;
1094 #endif // __ARM_NEON
1095 
1096 #if __ARM_NEON
1097 #if __aarch64__
1098                 if (nn > 0)
1099                 {
1100                     asm volatile(
1101                         "prfm   pldl1keep, [%2, #256]       \n"
1102                         "ld1    {v8.4s, v9.4s}, [%2]        \n" // r0
1103                         "add    %2, %2, #16                 \n"
1104 
1105                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
1106                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
1107 
1108                         "0:                                 \n"
1109 
1110                         "prfm   pldl1keep, [%1, #128]       \n"
1111                         "ld1    {v7.4s}, [%1]               \n" // _sum
1112 
1113                         "fmla   v7.4s, v8.4s, %10.s[0]      \n"
1114                         "fmul   v13.4s, v10.4s, %10.s[1]    \n"
1115                         "fmul   v14.4s, v11.4s, %10.s[2]    \n"
1116 
1117                         "prfm   pldl1keep, [%3, #256]       \n"
1118                         "ld1    {v8.4s, v9.4s}, [%3]        \n" // r1
1119                         "add    %3, %3, #16                 \n"
1120 
1121                         "fmla   v7.4s, v8.4s, %11.s[0]      \n"
1122 
1123                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
1124                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
1125 
1126                         "fmla   v13.4s, v10.4s, %11.s[1]    \n"
1127                         "fmla   v14.4s, v11.4s, %11.s[2]    \n"
1128 
1129                         "prfm   pldl1keep, [%4, #256]       \n"
1130                         "ld1    {v8.4s, v9.4s}, [%4]        \n" // r2
1131                         "add    %4, %4, #16                 \n"
1132 
1133                         "fmla   v7.4s, v8.4s, %12.s[0]      \n"
1134 
1135                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
1136                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
1137 
1138                         "fmla   v13.4s, v10.4s, %12.s[1]    \n"
1139                         "fmla   v14.4s, v11.4s, %12.s[2]    \n"
1140 
1141                         "prfm   pldl1keep, [%2, #256]       \n"
1142                         "ld1    {v8.4s, v9.4s}, [%2]        \n" // r0
1143                         "add    %2, %2, #16                 \n"
1144 
1145                         "fadd   v7.4s, v7.4s, v13.4s        \n"
1146                         "fadd   v7.4s, v7.4s, v14.4s        \n"
1147 
1148                         "ext    v10.16b, v8.16b, v9.16b, #4 \n"
1149                         "ext    v11.16b, v8.16b, v9.16b, #8 \n"
1150 
1151                         "st1    {v7.4s}, [%1], #16          \n"
1152 
1153                         "subs   %w0, %w0, #1                \n"
1154                         "bne    0b                          \n"
1155 
1156                         "sub    %2, %2, #16                 \n"
1157                         : "=r"(nn),     // %0
1158                         "=r"(outptr), // %1
1159                         "=r"(r0),     // %2
1160                         "=r"(r1),     // %3
1161                         "=r"(r2)      // %4
1162                         : "0"(nn),
1163                         "1"(outptr),
1164                         "2"(r0),
1165                         "3"(r1),
1166                         "4"(r2),
1167                         "w"(_k0123), // %10
1168                         "w"(_k3456), // %11
1169                         "w"(_k6789)  // %12
1170                         : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
1171                 }
1172 #else
1173                 if (nn > 0)
1174                 {
1175                     asm volatile(
1176                         "pld        [%2, #192]          \n"
1177                         "vld1.f32   {d16-d18}, [%2]     \n" // r0
1178                         "add        %2, #16             \n"
1179 
1180                         "vext.32    q10, q8, q9, #1     \n"
1181                         "vext.32    q11, q8, q9, #2     \n"
1182 
1183                         "0:                             \n"
1184 
1185                         "pld        [%1, #128]          \n"
1186                         "vld1.f32   {d14-d15}, [%1]     \n" // _sum
1187 
1188                         "vmla.f32   q7, q8, %e10[0]     \n"
1189                         "vmul.f32   q13, q10, %e10[1]   \n"
1190                         "vmul.f32   q14, q11, %f10[0]   \n"
1191 
1192                         "pld        [%3, #192]          \n"
1193                         "vld1.f32   {d16-d18}, [%3]     \n" // r1
1194                         "add        %3, #16             \n"
1195 
1196                         "vmla.f32   q7, q8, %e11[0]     \n"
1197 
1198                         "vext.32    q10, q8, q9, #1     \n"
1199                         "vext.32    q11, q8, q9, #2     \n"
1200 
1201                         "vmla.f32   q13, q10, %e11[1]   \n"
1202                         "vmla.f32   q14, q11, %f11[0]   \n"
1203 
1204                         "pld        [%4, #192]          \n"
1205                         "vld1.f32   {d16-d18}, [%4]     \n" // r2
1206                         "add        %4, #16             \n"
1207 
1208                         "vmla.f32   q7, q8, %e12[0]     \n"
1209 
1210                         "vext.32    q10, q8, q9, #1     \n"
1211                         "vext.32    q11, q8, q9, #2     \n"
1212 
1213                         "vmla.f32   q13, q10, %e12[1]   \n"
1214                         "vmla.f32   q14, q11, %f12[0]   \n"
1215 
1216                         "pld        [%2, #192]          \n"
1217                         "vld1.f32   {d16-d18}, [%2]     \n" // r0
1218                         "add        %2, #16             \n"
1219 
1220                         "vadd.f32   q7, q7, q13         \n"
1221                         "vadd.f32   q7, q7, q14         \n"
1222 
1223                         "vext.32    q10, q8, q9, #1     \n"
1224                         "vext.32    q11, q8, q9, #2     \n"
1225 
1226                         "vst1.f32   {d14-d15}, [%1]!    \n"
1227 
1228                         "subs       %0, #1              \n"
1229                         "bne        0b                  \n"
1230 
1231                         "sub        %2, #16             \n"
1232                         : "=r"(nn),     // %0
1233                         "=r"(outptr), // %1
1234                         "=r"(r0),     // %2
1235                         "=r"(r1),     // %3
1236                         "=r"(r2)      // %4
1237                         : "0"(nn),
1238                         "1"(outptr),
1239                         "2"(r0),
1240                         "3"(r1),
1241                         "4"(r2),
1242                         "w"(_k0123), // %10
1243                         "w"(_k3456), // %11
1244                         "w"(_k6789)  // %12
1245                         : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1246                 }
1247 #endif // __aarch64__
1248 #endif // __ARM_NEON
1249                 for (; remain > 0; remain--)
1250                 {
1251 #if __ARM_NEON
1252                     float32x4_t _r00 = vld1q_f32(r0);
1253                     float32x4_t _r10 = vld1q_f32(r1);
1254                     float32x4_t _r20 = vld1q_f32(r2);
1255 
1256                     float32x4_t _sum = vmulq_f32(_r00, _k0123);
1257                     _sum = vmlaq_f32(_sum, _r10, _k3456);
1258                     _sum = vmlaq_f32(_sum, _r20, _k6789);
1259 
1260                     _sum = vsetq_lane_f32(*outptr, _sum, 3);
1261 
1262 #if __aarch64__
1263                     *outptr = vaddvq_f32(_sum);
1264 #else
1265                     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
1266                     _ss = vpadd_f32(_ss, _ss);
1267 
1268                     *outptr = vget_lane_f32(_ss, 0);
1269 #endif // __aarch64__
1270 #else
1271                     float sum = 0;
1272 
1273                     sum += r0[0] * k0[0];
1274                     sum += r0[1] * k0[1];
1275                     sum += r0[2] * k0[2];
1276                     sum += r1[0] * k1[0];
1277                     sum += r1[1] * k1[1];
1278                     sum += r1[2] * k1[2];
1279                     sum += r2[0] * k2[0];
1280                     sum += r2[1] * k2[1];
1281                     sum += r2[2] * k2[2];
1282 
1283                     *outptr += sum;
1284 #endif
1285                     r0++;
1286                     r1++;
1287                     r2++;
1288                     outptr++;
1289                 }
1290 
1291                 r0 += 2;
1292                 r1 += 2;
1293                 r2 += 2;
1294             }
1295 
1296             kernel0 += 9;
1297         }
1298     }
1299 }
1300 
conv3x3s1_winograd64_transform_kernel_neon(const Mat & kernel,Mat & kernel_tm,int inch,int outch)1301 static void conv3x3s1_winograd64_transform_kernel_neon(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
1302 {
1303     kernel_tm.create(8 * 8, inch, outch);
1304 
1305     const float ktm[8][3] = {
1306         {1.0f, 0.0f, 0.0f},
1307         {-2.0f / 9, -2.0f / 9, -2.0f / 9},
1308         {-2.0f / 9, 2.0f / 9, -2.0f / 9},
1309         {1.0f / 90, 1.0f / 45, 2.0f / 45},
1310         {1.0f / 90, -1.0f / 45, 2.0f / 45},
1311         {1.0f / 45, 1.0f / 90, 1.0f / 180},
1312         {1.0f / 45, -1.0f / 90, 1.0f / 180},
1313         {0.0f, 0.0f, 1.0f}
1314     };
1315 
1316     #pragma omp parallel for
1317     for (int p = 0; p < outch; p++)
1318     {
1319         for (int q = 0; q < inch; q++)
1320         {
1321             const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
1322             float* kernel_tm0 = kernel_tm.channel(p).row(q);
1323 
1324             // transform kernel, transposed
1325             const float* k0 = kernel0;
1326             const float* k1 = kernel0 + 3;
1327             const float* k2 = kernel0 + 6;
1328 
1329             // h
1330             float tmp[8][3];
1331             for (int i = 0; i < 8; i++)
1332             {
1333                 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
1334                 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
1335                 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
1336             }
1337 
1338             // v
1339             for (int j = 0; j < 8; j++)
1340             {
1341                 float* tmpp = &tmp[j][0];
1342 
1343                 for (int i = 0; i < 8; i++)
1344                 {
1345                     kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
1346                 }
1347             }
1348         }
1349     }
1350 
1351     // optimized layout for winograd4
1352     // interleave weights
1353     int nn_outch = outch >> 2;
1354     int remain_outch_start = nn_outch << 2;
1355 
1356     Mat kernel_tm2(8 * 8 * inch * 4, 1, nn_outch + (outch % 4 + 3) / 4);
1357 
1358     #pragma omp parallel for
1359     for (int pp = 0; pp < nn_outch; pp++)
1360     {
1361         int p = pp * 4;
1362 
1363         float* ktm2 = kernel_tm2.channel(pp);
1364 
1365         const Mat kernel0_tm = kernel_tm.channel(p);
1366         const Mat kernel1_tm = kernel_tm.channel(p + 1);
1367         const Mat kernel2_tm = kernel_tm.channel(p + 2);
1368         const Mat kernel3_tm = kernel_tm.channel(p + 3);
1369 
1370         int q = 0;
1371 
1372 #if __ARM_NEON && __aarch64__
1373         for (; q + 3 < inch; q += 4)
1374         {
1375             const float* k00 = kernel0_tm.row(q);
1376             const float* k01 = kernel0_tm.row(q + 1);
1377             const float* k02 = kernel0_tm.row(q + 2);
1378             const float* k03 = kernel0_tm.row(q + 3);
1379             const float* k10 = kernel1_tm.row(q);
1380             const float* k11 = kernel1_tm.row(q + 1);
1381             const float* k12 = kernel1_tm.row(q + 2);
1382             const float* k13 = kernel1_tm.row(q + 3);
1383             const float* k20 = kernel2_tm.row(q);
1384             const float* k21 = kernel2_tm.row(q + 1);
1385             const float* k22 = kernel2_tm.row(q + 2);
1386             const float* k23 = kernel2_tm.row(q + 3);
1387             const float* k30 = kernel3_tm.row(q);
1388             const float* k31 = kernel3_tm.row(q + 1);
1389             const float* k32 = kernel3_tm.row(q + 2);
1390             const float* k33 = kernel3_tm.row(q + 3);
1391 
1392             for (int r = 0; r < 16; r++)
1393             {
1394                 // split into two asm blocks for gcc reject over 30 oprands :(
1395                 asm volatile(
1396                     "ld1    {v0.4s}, [%1], #16  \n"
1397                     "ld1    {v1.4s}, [%2], #16  \n"
1398                     "ld1    {v2.4s}, [%3], #16  \n"
1399                     "ld1    {v3.4s}, [%4], #16  \n"
1400                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64  \n"
1401 
1402                     "ld1    {v0.4s}, [%5], #16  \n"
1403                     "ld1    {v1.4s}, [%6], #16  \n"
1404                     "ld1    {v2.4s}, [%7], #16  \n"
1405                     "ld1    {v3.4s}, [%8], #16  \n"
1406                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64  \n"
1407 
1408                     : "=r"(ktm2), // %0
1409                     "=r"(k00),  // %1
1410                     "=r"(k01),  // %2
1411                     "=r"(k02),  // %3
1412                     "=r"(k03),  // %4
1413                     "=r"(k10),  // %5
1414                     "=r"(k11),  // %6
1415                     "=r"(k12),  // %7
1416                     "=r"(k13)   // %8
1417                     : "0"(ktm2),
1418                     "1"(k00),
1419                     "2"(k01),
1420                     "3"(k02),
1421                     "4"(k03),
1422                     "5"(k10),
1423                     "6"(k11),
1424                     "7"(k12),
1425                     "8"(k13)
1426                     : "cc", "memory", "v0", "v1", "v2", "v3");
1427                 asm volatile(
1428                     "ld1    {v0.4s}, [%1], #16  \n"
1429                     "ld1    {v1.4s}, [%2], #16  \n"
1430                     "ld1    {v2.4s}, [%3], #16  \n"
1431                     "ld1    {v3.4s}, [%4], #16  \n"
1432                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64  \n"
1433 
1434                     "ld1    {v0.4s}, [%5], #16  \n"
1435                     "ld1    {v1.4s}, [%6], #16  \n"
1436                     "ld1    {v2.4s}, [%7], #16  \n"
1437                     "ld1    {v3.4s}, [%8], #16  \n"
1438                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64  \n"
1439 
1440                     : "=r"(ktm2), // %0
1441                     "=r"(k20),  // %1
1442                     "=r"(k21),  // %2
1443                     "=r"(k22),  // %3
1444                     "=r"(k23),  // %4
1445                     "=r"(k30),  // %5
1446                     "=r"(k31),  // %6
1447                     "=r"(k32),  // %7
1448                     "=r"(k33)   // %8
1449                     : "0"(ktm2),
1450                     "1"(k20),
1451                     "2"(k21),
1452                     "3"(k22),
1453                     "4"(k23),
1454                     "5"(k30),
1455                     "6"(k31),
1456                     "7"(k32),
1457                     "8"(k33)
1458                     : "cc", "memory", "v0", "v1", "v2", "v3");
1459             }
1460         }
1461 #endif // __ARM_NEON && __aarch64__
1462 
1463         for (; q + 1 < inch; q += 2)
1464         {
1465             const float* k00 = kernel0_tm.row(q);
1466             const float* k01 = kernel0_tm.row(q + 1);
1467             const float* k10 = kernel1_tm.row(q);
1468             const float* k11 = kernel1_tm.row(q + 1);
1469             const float* k20 = kernel2_tm.row(q);
1470             const float* k21 = kernel2_tm.row(q + 1);
1471             const float* k30 = kernel3_tm.row(q);
1472             const float* k31 = kernel3_tm.row(q + 1);
1473 
1474             for (int r = 0; r < 16; r++)
1475             {
1476 #if __ARM_NEON
1477 #if __aarch64__
1478                 asm volatile(
1479                     "ld1    {v0.4s}, [%1], #16  \n"
1480                     "ld1    {v1.4s}, [%2], #16  \n"
1481                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1482 
1483                     "ld1    {v0.4s}, [%3], #16  \n"
1484                     "ld1    {v1.4s}, [%4], #16  \n"
1485                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1486 
1487                     "ld1    {v0.4s}, [%5], #16  \n"
1488                     "ld1    {v1.4s}, [%6], #16  \n"
1489                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1490 
1491                     "ld1    {v0.4s}, [%7], #16  \n"
1492                     "ld1    {v1.4s}, [%8], #16  \n"
1493                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1494 
1495                     : "=r"(ktm2), // %0
1496                     "=r"(k00),  // %1
1497                     "=r"(k01),  // %2
1498                     "=r"(k10),  // %3
1499                     "=r"(k11),  // %4
1500                     "=r"(k20),  // %5
1501                     "=r"(k21),  // %6
1502                     "=r"(k30),  // %7
1503                     "=r"(k31)   // %8
1504                     : "0"(ktm2),
1505                     "1"(k00),
1506                     "2"(k01),
1507                     "3"(k10),
1508                     "4"(k11),
1509                     "5"(k20),
1510                     "6"(k21),
1511                     "7"(k30),
1512                     "8"(k31)
1513                     : "cc", "memory", "v0", "v1");
1514 #else
1515                 asm volatile(
1516                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
1517                     "vld1.f32   {d2-d3}, [%2 :128]! \n"
1518                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1519 
1520                     "vld1.f32   {d0-d1}, [%3 :128]! \n"
1521                     "vld1.f32   {d2-d3}, [%4 :128]! \n"
1522                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1523 
1524                     "vld1.f32   {d0-d1}, [%5 :128]! \n"
1525                     "vld1.f32   {d2-d3}, [%6 :128]! \n"
1526                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1527 
1528                     "vld1.f32   {d0-d1}, [%7 :128]! \n"
1529                     "vld1.f32   {d2-d3}, [%8 :128]! \n"
1530                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1531 
1532                     : "=r"(ktm2), // %0
1533                     "=r"(k00),  // %1
1534                     "=r"(k01),  // %2
1535                     "=r"(k10),  // %3
1536                     "=r"(k11),  // %4
1537                     "=r"(k20),  // %5
1538                     "=r"(k21),  // %6
1539                     "=r"(k30),  // %7
1540                     "=r"(k31)   // %8
1541                     : "0"(ktm2),
1542                     "1"(k00),
1543                     "2"(k01),
1544                     "3"(k10),
1545                     "4"(k11),
1546                     "5"(k20),
1547                     "6"(k21),
1548                     "7"(k30),
1549                     "8"(k31)
1550                     : "cc", "memory", "q0", "q1");
1551 #endif // __aarch64__
1552 #else
1553                 for (int m = 0; m < 4; m++)
1554                 {
1555                     ktm2[0 + m] = k00[m];
1556                     ktm2[4 + m] = k01[m];
1557                     ktm2[8 + m] = k10[m];
1558                     ktm2[12 + m] = k11[m];
1559                     ktm2[16 + m] = k20[m];
1560                     ktm2[20 + m] = k21[m];
1561                     ktm2[24 + m] = k30[m];
1562                     ktm2[28 + m] = k31[m];
1563                 }
1564 
1565                 k00 += 4;
1566                 k01 += 4;
1567                 k10 += 4;
1568                 k11 += 4;
1569                 k20 += 4;
1570                 k21 += 4;
1571                 k30 += 4;
1572                 k31 += 4;
1573                 ktm2 += 32;
1574 #endif // __ARM_NEON
1575             }
1576         }
1577 
1578         for (; q < inch; q++)
1579         {
1580             const float* k00 = kernel0_tm.row(q);
1581             const float* k10 = kernel1_tm.row(q);
1582             const float* k20 = kernel2_tm.row(q);
1583             const float* k30 = kernel3_tm.row(q);
1584 
1585             for (int r = 0; r < 16; r++)
1586             {
1587 #if __ARM_NEON
1588 #if __aarch64__
1589                 asm volatile(
1590                     "ld1    {v0.4s}, [%1], #16  \n"
1591                     "ld1    {v1.4s}, [%2], #16  \n"
1592                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1593 
1594                     "ld1    {v0.4s}, [%3], #16  \n"
1595                     "ld1    {v1.4s}, [%4], #16  \n"
1596                     "st1    {v0.4s, v1.4s}, [%0], #32  \n"
1597 
1598                     : "=r"(ktm2), // %0
1599                     "=r"(k00),  // %1
1600                     "=r"(k10),  // %2
1601                     "=r"(k20),  // %3
1602                     "=r"(k30)   // %4
1603                     : "0"(ktm2),
1604                     "1"(k00),
1605                     "2"(k10),
1606                     "3"(k20),
1607                     "4"(k30)
1608                     : "cc", "memory", "v0", "v1");
1609 #else
1610                 asm volatile(
1611                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
1612                     "vld1.f32   {d2-d3}, [%2 :128]! \n"
1613                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1614 
1615                     "vld1.f32   {d0-d1}, [%3 :128]! \n"
1616                     "vld1.f32   {d2-d3}, [%4 :128]! \n"
1617                     "vst1.f32   {d0-d3}, [%0 :128]! \n"
1618 
1619                     : "=r"(ktm2), // %0
1620                     "=r"(k00),  // %1
1621                     "=r"(k10),  // %2
1622                     "=r"(k20),  // %3
1623                     "=r"(k30)   // %4
1624                     : "0"(ktm2),
1625                     "1"(k00),
1626                     "2"(k10),
1627                     "3"(k20),
1628                     "4"(k30)
1629                     : "cc", "memory", "q0", "q1");
1630 #endif // __aarch64__
1631 #else
1632                 for (int m = 0; m < 4; m++)
1633                 {
1634                     ktm2[0 + m] = k00[m];
1635                     ktm2[4 + m] = k10[m];
1636                     ktm2[8 + m] = k20[m];
1637                     ktm2[12 + m] = k30[m];
1638                 }
1639 
1640                 k00 += 4;
1641                 k10 += 4;
1642                 k20 += 4;
1643                 k30 += 4;
1644                 ktm2 += 16;
1645 #endif // __ARM_NEON
1646             }
1647         }
1648     }
1649 
1650     #pragma omp parallel for
1651     for (int p = remain_outch_start; p < outch; p++)
1652     {
1653         float* ktm2 = (float*)kernel_tm2.channel(nn_outch) + 8 * 8 * inch * (p - remain_outch_start);
1654 
1655         const Mat kernel0_tm = kernel_tm.channel(p);
1656 
1657         int q = 0;
1658 
1659         for (; q < inch; q++)
1660         {
1661             const float* k00 = kernel0_tm.row(q);
1662 
1663             for (int r = 0; r < 16; r++)
1664             {
1665 #if __ARM_NEON
1666 #if __aarch64__
1667                 asm volatile(
1668                     "ld1    {v0.4s}, [%1], #16  \n"
1669                     "st1    {v0.4s}, [%0], #16  \n"
1670                     : "=r"(ktm2), // %0
1671                     "=r"(k00)   // %1
1672                     : "0"(ktm2),
1673                     "1"(k00)
1674                     : "cc", "memory", "v0");
1675 #else
1676                 asm volatile(
1677                     "vld1.f32   {d0-d1}, [%1 :128]! \n"
1678                     "vst1.f32   {d0-d1}, [%0 :128]! \n"
1679                     : "=r"(ktm2), // %0
1680                     "=r"(k00)   // %1
1681                     : "0"(ktm2),
1682                     "1"(k00)
1683                     : "cc", "memory", "q0");
1684 #endif // __aarch64__
1685 #else
1686                 for (int m = 0; m < 4; m++)
1687                 {
1688                     ktm2[m] = k00[m];
1689                 }
1690 
1691                 k00 += 4;
1692                 ktm2 += 4;
1693 #endif // __ARM_NEON
1694             }
1695         }
1696     }
1697 
1698     kernel_tm = kernel_tm2;
1699 }
1700 
conv3x3s1_winograd64_transform_kernel_neon5(const Mat & kernel,Mat & kernel_tm,int inch,int outch)1701 static void conv3x3s1_winograd64_transform_kernel_neon5(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
1702 {
1703     kernel_tm.create(8 * 8, inch, outch);
1704 
1705     const float ktm[8][3] = {
1706         {1.0f, 0.0f, 0.0f},
1707         {-2.0f / 9, -2.0f / 9, -2.0f / 9},
1708         {-2.0f / 9, 2.0f / 9, -2.0f / 9},
1709         {1.0f / 90, 1.0f / 45, 2.0f / 45},
1710         {1.0f / 90, -1.0f / 45, 2.0f / 45},
1711         {1.0f / 45, 1.0f / 90, 1.0f / 180},
1712         {1.0f / 45, -1.0f / 90, 1.0f / 180},
1713         {0.0f, 0.0f, 1.0f}
1714     };
1715 
1716     #pragma omp parallel for
1717     for (int p = 0; p < outch; p++)
1718     {
1719         for (int q = 0; q < inch; q++)
1720         {
1721             const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
1722             float* kernel_tm0 = kernel_tm.channel(p).row(q);
1723 
1724             // transform kernel, transposed
1725             const float* k0 = kernel0;
1726             const float* k1 = kernel0 + 3;
1727             const float* k2 = kernel0 + 6;
1728 
1729             // h
1730             float tmp[8][3];
1731             for (int i = 0; i < 8; i++)
1732             {
1733                 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
1734                 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
1735                 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
1736             }
1737 
1738             // v
1739             for (int j = 0; j < 8; j++)
1740             {
1741                 float* tmpp = &tmp[j][0];
1742 
1743                 for (int i = 0; i < 8; i++)
1744                 {
1745                     kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
1746                 }
1747             }
1748         }
1749     }
1750 
1751     // optimized layout for winograd5
1752     // interleave weights
1753 //     Mat kernel_tm2(8*8, inch, outch);
1754 //     Mat kernel_tm2(inch, 64, outch);
1755 #if __ARM_NEON && __aarch64__
1756     Mat kernel_tm2(8 * 4 * (inch / 4) + 8 * (inch % 4), 64, outch / 8 + (outch % 8) / 4 + outch % 4);
1757 #else
1758     Mat kernel_tm2(4 * 4 * (inch / 4) + 4 * (inch % 4), 64, outch / 4 + outch % 4);
1759 #endif
1760 
1761     int p = 0;
1762 #if __aarch64__
1763     for (; p + 7 < outch; p += 8)
1764     {
1765         const Mat kernel0_tm = kernel_tm.channel(p);
1766         const Mat kernel1_tm = kernel_tm.channel(p + 1);
1767         const Mat kernel2_tm = kernel_tm.channel(p + 2);
1768         const Mat kernel3_tm = kernel_tm.channel(p + 3);
1769         const Mat kernel4_tm = kernel_tm.channel(p + 4);
1770         const Mat kernel5_tm = kernel_tm.channel(p + 5);
1771         const Mat kernel6_tm = kernel_tm.channel(p + 6);
1772         const Mat kernel7_tm = kernel_tm.channel(p + 7);
1773 
1774         Mat ktm2 = kernel_tm2.channel(p / 8);
1775 
1776         for (int r = 0; r < 64; r++)
1777         {
1778             float* ktm2p = ktm2.row(r);
1779 
1780             for (int q = 0; q < inch; q++)
1781             {
1782                 const float* ktm0_0 = kernel0_tm.row(q);
1783                 const float* ktm1_0 = kernel1_tm.row(q);
1784                 const float* ktm2_0 = kernel2_tm.row(q);
1785                 const float* ktm3_0 = kernel3_tm.row(q);
1786                 const float* ktm4_0 = kernel4_tm.row(q);
1787                 const float* ktm5_0 = kernel5_tm.row(q);
1788                 const float* ktm6_0 = kernel6_tm.row(q);
1789                 const float* ktm7_0 = kernel7_tm.row(q);
1790 
1791                 ktm2p[0] = ktm0_0[r];
1792                 ktm2p[1] = ktm1_0[r];
1793                 ktm2p[2] = ktm2_0[r];
1794                 ktm2p[3] = ktm3_0[r];
1795                 ktm2p[4] = ktm4_0[r];
1796                 ktm2p[5] = ktm5_0[r];
1797                 ktm2p[6] = ktm6_0[r];
1798                 ktm2p[7] = ktm7_0[r];
1799 
1800                 ktm2p += 8;
1801             }
1802         }
1803     }
1804 #endif // __aarch64__
1805     for (; p + 3 < outch; p += 4)
1806     {
1807         const Mat kernel0_tm = kernel_tm.channel(p);
1808         const Mat kernel1_tm = kernel_tm.channel(p + 1);
1809         const Mat kernel2_tm = kernel_tm.channel(p + 2);
1810         const Mat kernel3_tm = kernel_tm.channel(p + 3);
1811 
1812 #if __ARM_NEON && __aarch64__
1813         Mat ktm2 = kernel_tm2.channel(p / 8 + (p % 8) / 4);
1814 #else
1815         Mat ktm2 = kernel_tm2.channel(p / 4);
1816 #endif
1817 
1818         for (int r = 0; r < 64; r++)
1819         {
1820             float* ktm2p = ktm2.row(r);
1821 
1822             for (int q = 0; q < inch; q++)
1823             {
1824                 const float* ktm0_0 = kernel0_tm.row(q);
1825                 const float* ktm1_0 = kernel1_tm.row(q);
1826                 const float* ktm2_0 = kernel2_tm.row(q);
1827                 const float* ktm3_0 = kernel3_tm.row(q);
1828 
1829                 ktm2p[0] = ktm0_0[r];
1830                 ktm2p[1] = ktm1_0[r];
1831                 ktm2p[2] = ktm2_0[r];
1832                 ktm2p[3] = ktm3_0[r];
1833 
1834                 ktm2p += 4;
1835             }
1836         }
1837     }
1838     for (; p < outch; p++)
1839     {
1840         const Mat kernel0_tm = kernel_tm.channel(p);
1841 
1842 #if __ARM_NEON && __aarch64__
1843         Mat ktm2 = kernel_tm2.channel(p / 8 + (p % 8) / 4 + p % 4);
1844 #else
1845         Mat ktm2 = kernel_tm2.channel(p / 4 + p % 4);
1846 #endif
1847 
1848         for (int r = 0; r < 64; r++)
1849         {
1850             float* ktm2p = ktm2.row(r);
1851 
1852             for (int q = 0; q < inch; q++)
1853             {
1854                 const float* ktm0_0 = kernel0_tm.row(q);
1855 
1856                 ktm2p[0] = ktm0_0[r];
1857 
1858                 ktm2p += 1;
1859             }
1860         }
1861     }
1862 
1863     kernel_tm = kernel_tm2;
1864 }
1865 
conv3x3s1_winograd64_neon4(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Mat & _bias,const Option & opt)1866 static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
1867 {
1868     int w = bottom_blob.w;
1869     int h = bottom_blob.h;
1870     int inch = bottom_blob.c;
1871 
1872     int outw = top_blob.w;
1873     int outh = top_blob.h;
1874     int outch = top_blob.c;
1875 
1876     // pad to 6n+2
1877     Mat bottom_blob_bordered = bottom_blob;
1878 
1879     outw = (outw + 5) / 6 * 6;
1880     outh = (outh + 5) / 6 * 6;
1881 
1882     w = outw + 2;
1883     h = outh + 2;
1884     Option opt_b = opt;
1885     opt_b.blob_allocator = opt.workspace_allocator;
1886     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
1887 
1888     const float* bias = _bias;
1889 
1890     // BEGIN transform input
1891     Mat bottom_blob_tm;
1892     {
1893         int w_tm = outw / 6 * 8;
1894         int h_tm = outh / 6 * 8;
1895         bottom_blob_tm.create(4, 16 * w_tm / 8 * h_tm / 8, inch, 4u, opt.workspace_allocator);
1896         const int tiles = w_tm / 8 * h_tm / 8;
1897 
1898         //         const float itm[8][8] = {
1899         //             {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
1900         //
1901         //             {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
1902         //             {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
1903         //
1904         //             {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
1905         //             {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
1906         //
1907         //             {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
1908         //             {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
1909         //
1910         //             {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
1911         //         };
1912 
1913         // 0 = r00 - r06 + (r04 - r02) * 5.25
1914         // 7 = r07 - r01 + (r03 - r05) * 5.25
1915 
1916         // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
1917         // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
1918 
1919         // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
1920         // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
1921 
1922         // reuse r04 * 1.25
1923         // reuse r03 * 2.5
1924         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
1925         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
1926 
1927 #if __ARM_NEON
1928         const float coeff[8] = {
1929             0.25f, 0.5f, -1.25f, 2.f,
1930             -2.5f, 4.f, 4.25f, 5.25f
1931         };
1932         float32x4_t _coeff0 = vld1q_f32(coeff);
1933         float32x4_t _coeff1 = vld1q_f32(coeff + 4);
1934 #endif // __ARM_NEON
1935 
1936         #pragma omp parallel for num_threads(opt.num_threads)
1937         for (int q = 0; q < inch; q++)
1938         {
1939             const Mat img0 = bottom_blob_bordered.channel(q);
1940             Mat img0_tm = bottom_blob_tm.channel(q);
1941 
1942             float tmp[8][8];
1943 
1944             // tile
1945             for (int i = 0; i < h_tm / 8; i++)
1946             {
1947                 for (int j = 0; j < w_tm / 8; j++)
1948                 {
1949 #if __ARM_NEON
1950                     const float* r0 = img0.row(i * 6) + j * 6;
1951                     const float* r1 = r0 + w;
1952                     const float* r2 = r0 + w * 2;
1953                     const float* r3 = r0 + w * 3;
1954 
1955                     // the assembly block for armv7 input transform requires 13 general registers
1956                     // old gcc may fail to allocate register on debug build without -fomit-frame-pointer
1957                     // so, fallback to intrinsic version for armv7 debug build     --- nihui
1958 #if __aarch64__ || !defined(NDEBUG)
1959                     for (int m = 0; m + 3 < 8; m += 4)
1960                     {
1961                         float32x4_t _r0_0123 = vld1q_f32(r0);
1962                         float32x4_t _r0_4567 = vld1q_f32(r0 + 4);
1963                         float32x4_t _r1_0123 = vld1q_f32(r1);
1964                         float32x4_t _r1_4567 = vld1q_f32(r1 + 4);
1965                         float32x4_t _r2_0123 = vld1q_f32(r2);
1966                         float32x4_t _r2_4567 = vld1q_f32(r2 + 4);
1967                         float32x4_t _r3_0123 = vld1q_f32(r3);
1968                         float32x4_t _r3_4567 = vld1q_f32(r3 + 4);
1969 
1970                         float32x4x2_t _r01_00221133 = vtrnq_f32(_r0_0123, _r1_0123);
1971                         float32x4x2_t _r01_44665577 = vtrnq_f32(_r0_4567, _r1_4567);
1972                         float32x4x2_t _r23_00221133 = vtrnq_f32(_r2_0123, _r3_0123);
1973                         float32x4x2_t _r23_44665577 = vtrnq_f32(_r2_4567, _r3_4567);
1974 
1975                         // no vswp intrinsic  :(
1976                         float32x4_t _r_00 = vcombine_f32(vget_low_f32(_r01_00221133.val[0]), vget_low_f32(_r23_00221133.val[0]));
1977                         float32x4_t _r_11 = vcombine_f32(vget_low_f32(_r01_00221133.val[1]), vget_low_f32(_r23_00221133.val[1]));
1978                         float32x4_t _r_22 = vcombine_f32(vget_high_f32(_r01_00221133.val[0]), vget_high_f32(_r23_00221133.val[0]));
1979                         float32x4_t _r_33 = vcombine_f32(vget_high_f32(_r01_00221133.val[1]), vget_high_f32(_r23_00221133.val[1]));
1980                         float32x4_t _r_44 = vcombine_f32(vget_low_f32(_r01_44665577.val[0]), vget_low_f32(_r23_44665577.val[0]));
1981                         float32x4_t _r_55 = vcombine_f32(vget_low_f32(_r01_44665577.val[1]), vget_low_f32(_r23_44665577.val[1]));
1982                         float32x4_t _r_66 = vcombine_f32(vget_high_f32(_r01_44665577.val[0]), vget_high_f32(_r23_44665577.val[0]));
1983                         float32x4_t _r_77 = vcombine_f32(vget_high_f32(_r01_44665577.val[1]), vget_high_f32(_r23_44665577.val[1]));
1984 
1985                         float32x4_t _r_0_m_6 = vsubq_f32(_r_00, _r_66);
1986                         float32x4_t _r_7_m_1 = vsubq_f32(_r_77, _r_11);
1987 
1988                         float32x4_t _r_4_m_2 = vsubq_f32(_r_44, _r_22);
1989                         float32x4_t _r_3_m_5 = vsubq_f32(_r_33, _r_55);
1990 
1991                         float32x4_t _tmp0 = vmlaq_lane_f32(_r_0_m_6, _r_4_m_2, vget_high_f32(_coeff1), 1);
1992                         float32x4_t _tmp7 = vmlaq_lane_f32(_r_7_m_1, _r_3_m_5, vget_high_f32(_coeff1), 1);
1993 
1994                         vst1q_f32(&tmp[0][m], _tmp0);
1995                         vst1q_f32(&tmp[7][m], _tmp7);
1996 
1997                         float32x4_t _r_2_a_6 = vaddq_f32(_r_22, _r_66);
1998                         float32x4_t _r_1_a_5 = vaddq_f32(_r_11, _r_55);
1999 
2000                         float32x4_t _tmp12a = vmlsq_lane_f32(_r_2_a_6, _r_44, vget_high_f32(_coeff1), 0);
2001                         float32x4_t _tmp12b = vmlsq_lane_f32(_r_1_a_5, _r_33, vget_high_f32(_coeff1), 0);
2002 
2003                         float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
2004                         float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
2005 
2006                         vst1q_f32(&tmp[1][m], _tmp1);
2007                         vst1q_f32(&tmp[2][m], _tmp2);
2008 
2009                         float32x4_t _r_4_x_c = vmulq_lane_f32(_r_44, vget_high_f32(_coeff0), 0);
2010                         float32x4_t _r_3_x_c = vmulq_lane_f32(_r_33, vget_low_f32(_coeff1), 0);
2011 
2012                         float32x4_t _tmp34a = vaddq_f32(_r_66, _r_4_x_c);
2013                         _tmp34a = vmlaq_lane_f32(_tmp34a, _r_22, vget_low_f32(_coeff0), 0);
2014 
2015                         float32x4_t _tmp34b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_low_f32(_coeff0), 1);
2016                         _tmp34b = vmlaq_lane_f32(_tmp34b, _r_55, vget_high_f32(_coeff0), 1);
2017 
2018                         float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
2019                         float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
2020 
2021                         vst1q_f32(&tmp[3][m], _tmp3);
2022                         vst1q_f32(&tmp[4][m], _tmp4);
2023 
2024                         // reuse r04 * 1.25
2025                         // reuse r03 * 2.5
2026                         float32x4_t _r_2_a_4c = vaddq_f32(_r_22, _r_4_x_c);
2027                         float32x4_t _tmp56a = vmlaq_lane_f32(_r_66, _r_2_a_4c, vget_low_f32(_coeff1), 1);
2028                         float32x4_t _tmp56b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_high_f32(_coeff0), 1);
2029                         _tmp56b = vmlaq_lane_f32(_tmp56b, _r_55, vget_low_f32(_coeff0), 1);
2030 
2031                         float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
2032                         float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
2033 
2034                         vst1q_f32(&tmp[5][m], _tmp5);
2035                         vst1q_f32(&tmp[6][m], _tmp6);
2036 
2037                         r0 += w * 4;
2038                         r1 += w * 4;
2039                         r2 += w * 4;
2040                         r3 += w * 4;
2041                     }
2042 
2043                     const float* t0 = tmp[0];
2044                     const float* t1 = tmp[1];
2045                     const float* t2 = tmp[2];
2046                     const float* t3 = tmp[3];
2047 
2048                     float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
2049                     float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
2050                     float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
2051                     float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
2052                     float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
2053                     float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
2054                     float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
2055                     float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
2056 
2057                     for (int m = 0; m + 3 < 8; m += 4)
2058                     {
2059                         float32x4_t _t0_0123 = vld1q_f32(t0);
2060                         float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
2061                         float32x4_t _t1_0123 = vld1q_f32(t1);
2062                         float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
2063                         float32x4_t _t2_0123 = vld1q_f32(t2);
2064                         float32x4_t _t2_4567 = vld1q_f32(t2 + 4);
2065                         float32x4_t _t3_0123 = vld1q_f32(t3);
2066                         float32x4_t _t3_4567 = vld1q_f32(t3 + 4);
2067 
2068                         float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
2069                         float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
2070                         float32x4x2_t _t23_00221133 = vtrnq_f32(_t2_0123, _t3_0123);
2071                         float32x4x2_t _t23_44665577 = vtrnq_f32(_t2_4567, _t3_4567);
2072 
2073                         // no vswp intrinsic  :(
2074                         float32x4_t _t_00 = vcombine_f32(vget_low_f32(_t01_00221133.val[0]), vget_low_f32(_t23_00221133.val[0]));
2075                         float32x4_t _t_11 = vcombine_f32(vget_low_f32(_t01_00221133.val[1]), vget_low_f32(_t23_00221133.val[1]));
2076                         float32x4_t _t_22 = vcombine_f32(vget_high_f32(_t01_00221133.val[0]), vget_high_f32(_t23_00221133.val[0]));
2077                         float32x4_t _t_33 = vcombine_f32(vget_high_f32(_t01_00221133.val[1]), vget_high_f32(_t23_00221133.val[1]));
2078                         float32x4_t _t_44 = vcombine_f32(vget_low_f32(_t01_44665577.val[0]), vget_low_f32(_t23_44665577.val[0]));
2079                         float32x4_t _t_55 = vcombine_f32(vget_low_f32(_t01_44665577.val[1]), vget_low_f32(_t23_44665577.val[1]));
2080                         float32x4_t _t_66 = vcombine_f32(vget_high_f32(_t01_44665577.val[0]), vget_high_f32(_t23_44665577.val[0]));
2081                         float32x4_t _t_77 = vcombine_f32(vget_high_f32(_t01_44665577.val[1]), vget_high_f32(_t23_44665577.val[1]));
2082 
2083                         float32x4_t _t_0_m_6 = vsubq_f32(_t_00, _t_66);
2084                         float32x4_t _t_7_m_1 = vsubq_f32(_t_77, _t_11);
2085 
2086                         float32x4_t _t_4_m_2 = vsubq_f32(_t_44, _t_22);
2087                         float32x4_t _t_3_m_5 = vsubq_f32(_t_33, _t_55);
2088 
2089                         float32x4_t _r0_tm_0_0 = vmlaq_lane_f32(_t_0_m_6, _t_4_m_2, vget_high_f32(_coeff1), 1);
2090                         float32x4_t _r0_tm_4_3 = vmlaq_lane_f32(_t_7_m_1, _t_3_m_5, vget_high_f32(_coeff1), 1);
2091 
2092                         r0_tm0_0[0] = vgetq_lane_f32(_r0_tm_0_0, 0);
2093                         r0_tm1_0[0] = vgetq_lane_f32(_r0_tm_0_0, 1);
2094                         r0_tm2_0[0] = vgetq_lane_f32(_r0_tm_0_0, 2);
2095                         r0_tm3_0[0] = vgetq_lane_f32(_r0_tm_0_0, 3);
2096 
2097                         r0_tm0_4[3] = vgetq_lane_f32(_r0_tm_4_3, 0);
2098                         r0_tm1_4[3] = vgetq_lane_f32(_r0_tm_4_3, 1);
2099                         r0_tm2_4[3] = vgetq_lane_f32(_r0_tm_4_3, 2);
2100                         r0_tm3_4[3] = vgetq_lane_f32(_r0_tm_4_3, 3);
2101 
2102                         float32x4_t _t_2_m_6 = vaddq_f32(_t_22, _t_66);
2103                         float32x4_t _t_1_m_5 = vaddq_f32(_t_11, _t_55);
2104 
2105                         float32x4_t _tmp12a = vmlsq_lane_f32(_t_2_m_6, _t_44, vget_high_f32(_coeff1), 0);
2106                         float32x4_t _tmp12b = vmlsq_lane_f32(_t_1_m_5, _t_33, vget_high_f32(_coeff1), 0);
2107 
2108                         float32x4_t _r0_tm_0_1 = vaddq_f32(_tmp12a, _tmp12b);
2109                         float32x4_t _r0_tm_0_2 = vsubq_f32(_tmp12a, _tmp12b);
2110 
2111                         r0_tm0_0[1] = vgetq_lane_f32(_r0_tm_0_1, 0);
2112                         r0_tm1_0[1] = vgetq_lane_f32(_r0_tm_0_1, 1);
2113                         r0_tm2_0[1] = vgetq_lane_f32(_r0_tm_0_1, 2);
2114                         r0_tm3_0[1] = vgetq_lane_f32(_r0_tm_0_1, 3);
2115 
2116                         r0_tm0_0[2] = vgetq_lane_f32(_r0_tm_0_2, 0);
2117                         r0_tm1_0[2] = vgetq_lane_f32(_r0_tm_0_2, 1);
2118                         r0_tm2_0[2] = vgetq_lane_f32(_r0_tm_0_2, 2);
2119                         r0_tm3_0[2] = vgetq_lane_f32(_r0_tm_0_2, 3);
2120 
2121                         float32x4_t _t_4_x_c = vmulq_lane_f32(_t_44, vget_high_f32(_coeff0), 0);
2122                         float32x4_t _t_3_x_c = vmulq_lane_f32(_t_33, vget_low_f32(_coeff1), 0);
2123 
2124                         float32x4_t _tmp34a = vaddq_f32(_t_66, _t_4_x_c);
2125                         _tmp34a = vmlaq_lane_f32(_tmp34a, _t_22, vget_low_f32(_coeff0), 0);
2126 
2127                         float32x4_t _tmp34b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_low_f32(_coeff0), 1);
2128                         _tmp34b = vmlaq_lane_f32(_tmp34b, _t_55, vget_high_f32(_coeff0), 1);
2129 
2130                         float32x4_t _r0_tm_0_3 = vaddq_f32(_tmp34a, _tmp34b);
2131                         float32x4_t _r0_tm_4_0 = vsubq_f32(_tmp34a, _tmp34b);
2132 
2133                         r0_tm0_0[3] = vgetq_lane_f32(_r0_tm_0_3, 0);
2134                         r0_tm1_0[3] = vgetq_lane_f32(_r0_tm_0_3, 1);
2135                         r0_tm2_0[3] = vgetq_lane_f32(_r0_tm_0_3, 2);
2136                         r0_tm3_0[3] = vgetq_lane_f32(_r0_tm_0_3, 3);
2137 
2138                         r0_tm0_4[0] = vgetq_lane_f32(_r0_tm_4_0, 0);
2139                         r0_tm1_4[0] = vgetq_lane_f32(_r0_tm_4_0, 1);
2140                         r0_tm2_4[0] = vgetq_lane_f32(_r0_tm_4_0, 2);
2141                         r0_tm3_4[0] = vgetq_lane_f32(_r0_tm_4_0, 3);
2142 
2143                         float32x4_t _t_2_a_4c = vaddq_f32(_t_22, _t_4_x_c);
2144                         float32x4_t _tmp56a = vmlaq_lane_f32(_t_66, _t_2_a_4c, vget_low_f32(_coeff1), 1);
2145                         float32x4_t _tmp56b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_high_f32(_coeff0), 1);
2146                         _tmp56b = vmlaq_lane_f32(_tmp56b, _t_55, vget_low_f32(_coeff0), 1);
2147 
2148                         float32x4_t _r0_tm_4_1 = vaddq_f32(_tmp56a, _tmp56b);
2149                         float32x4_t _r0_tm_4_2 = vsubq_f32(_tmp56a, _tmp56b);
2150 
2151                         r0_tm0_4[1] = vgetq_lane_f32(_r0_tm_4_1, 0);
2152                         r0_tm1_4[1] = vgetq_lane_f32(_r0_tm_4_1, 1);
2153                         r0_tm2_4[1] = vgetq_lane_f32(_r0_tm_4_1, 2);
2154                         r0_tm3_4[1] = vgetq_lane_f32(_r0_tm_4_1, 3);
2155 
2156                         r0_tm0_4[2] = vgetq_lane_f32(_r0_tm_4_2, 0);
2157                         r0_tm1_4[2] = vgetq_lane_f32(_r0_tm_4_2, 1);
2158                         r0_tm2_4[2] = vgetq_lane_f32(_r0_tm_4_2, 2);
2159                         r0_tm3_4[2] = vgetq_lane_f32(_r0_tm_4_2, 3);
2160 
2161                         t0 += 8 * 4;
2162                         t1 += 8 * 4;
2163                         t2 += 8 * 4;
2164                         t3 += 8 * 4;
2165 
2166                         r0_tm0_0 += img0_tm.w * tiles * 2 * 4;
2167                         r0_tm0_4 += img0_tm.w * tiles * 2 * 4;
2168                         r0_tm1_0 += img0_tm.w * tiles * 2 * 4;
2169                         r0_tm1_4 += img0_tm.w * tiles * 2 * 4;
2170                         r0_tm2_0 += img0_tm.w * tiles * 2 * 4;
2171                         r0_tm2_4 += img0_tm.w * tiles * 2 * 4;
2172                         r0_tm3_0 += img0_tm.w * tiles * 2 * 4;
2173                         r0_tm3_4 += img0_tm.w * tiles * 2 * 4;
2174                     }
2175 #else  // __aarch64__
2176                     float* t0 = tmp[0];
2177                     float* t1 = tmp[1];
2178                     float* t2 = tmp[2];
2179                     float* t3 = tmp[3];
2180                     float* t4 = tmp[4];
2181                     float* t5 = tmp[5];
2182                     float* t6 = tmp[6];
2183                     float* t7 = tmp[7];
2184 
2185                     int stepw = w * 4 * 4;
2186 
2187                     asm volatile(
2188 
2189                         // loop0
2190                         "vld1.f32   {d16-d19}, [%8], %26    \n"
2191                         "vld1.f32   {d20-d23}, [%9], %26    \n"
2192                         "vld1.f32   {d24-d27}, [%10], %26   \n"
2193 
2194                         "vtrn.32    q8, q10             \n"
2195 
2196                         "vld1.f32   {d28-d31}, [%11], %26   \n"
2197 
2198                         "vtrn.32    q9, q11             \n"
2199                         "vtrn.32    q12, q14            \n"
2200                         "vtrn.32    q13, q15            \n"
2201 
2202                         "vswp       d17, d24            \n"
2203                         "vswp       d19, d26            \n"
2204                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
2205                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
2206 
2207                         "vsub.f32   q2, q8, q13         \n"
2208                         "vsub.f32   q3, q9, q12         \n"
2209 
2210                         "vadd.f32   q4, q12, q13        \n"
2211                         "vadd.f32   q5, q10, q11        \n"
2212 
2213                         "vmla.f32   q2, q3, %f25[1]     \n"
2214 
2215                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
2216                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
2217 
2218                         "vmls.f32   q4, q9, %f25[0]     \n"
2219                         "vmls.f32   q5, q14, %f25[0]    \n"
2220 
2221                         "vst1.f32   {d4-d5}, [%0]!      \n" // tmp[0][m]
2222 
2223                         "vmov       q3, q7              \n" // use q7
2224 
2225                         "vadd.f32   q2, q13, q6         \n" // use q6
2226                         "vmla.f32   q3, q10, %e24[1]    \n"
2227 
2228                         "vadd.f32   q8, q4, q5          \n"
2229                         "vsub.f32   q9, q4, q5          \n"
2230 
2231                         "vmov       q5, q7              \n" // use q7
2232 
2233                         "vadd.f32   q6, q12, q6         \n" // use q6
2234                         "vmla.f32   q5, q10, %f24[1]    \n"
2235 
2236                         "vmov       q4, q13             \n"
2237 
2238                         "vmla.f32   q2, q12, %e24[0]    \n"
2239                         "vmla.f32   q3, q11, %f24[1]    \n"
2240 
2241                         "vst1.f32   {d16-d17}, [%1]!    \n" // tmp[1][m]
2242 
2243                         "vmla.f32   q4, q6, %e25[1]     \n"
2244                         "vmla.f32   q5, q11, %e24[1]    \n"
2245 
2246                         "vst1.f32   {d18-d19}, [%2]!    \n" // tmp[2][m]
2247 
2248                         "vadd.f32   q8, q2, q3          \n"
2249                         "vsub.f32   q9, q2, q3          \n"
2250 
2251                         "vsub.f32   q6, q15, q10        \n"
2252                         "vsub.f32   q7, q14, q11        \n"
2253 
2254                         "vadd.f32   q2, q4, q5          \n"
2255                         "vsub.f32   q3, q4, q5          \n"
2256 
2257                         "vst1.f32   {d16-d17}, [%3]!    \n" // tmp[3][m]
2258                         "vst1.f32   {d18-d19}, [%4]!    \n" // tmp[4][m]
2259 
2260                         "vmla.f32   q6, q7, %f25[1]     \n"
2261 
2262                         "vst1.f32   {d4-d5}, [%5]!      \n" // tmp[5][m]
2263                         "vst1.f32   {d6-d7}, [%6]!      \n" // tmp[6][m]
2264 
2265                         "vst1.f32   {d12-d13}, [%7]!    \n" // tmp[7][m]
2266 
2267                         // loop1
2268                         "vld1.f32   {d16-d19}, [%8]     \n"
2269                         "vld1.f32   {d20-d23}, [%9]     \n"
2270                         "vld1.f32   {d24-d27}, [%10]    \n"
2271 
2272                         "vtrn.32    q8, q10             \n"
2273 
2274                         "vld1.f32   {d28-d31}, [%11]    \n"
2275 
2276                         "vtrn.32    q9, q11             \n"
2277                         "vtrn.32    q12, q14            \n"
2278                         "vtrn.32    q13, q15            \n"
2279 
2280                         "vswp       d17, d24            \n"
2281                         "vswp       d19, d26            \n"
2282                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
2283                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
2284 
2285                         "vsub.f32   q2, q8, q13         \n"
2286                         "vsub.f32   q3, q9, q12         \n"
2287 
2288                         "vadd.f32   q4, q12, q13        \n"
2289                         "vadd.f32   q5, q10, q11        \n"
2290 
2291                         "vmla.f32   q2, q3, %f25[1]     \n"
2292 
2293                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
2294                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
2295 
2296                         "vmls.f32   q4, q9, %f25[0]     \n"
2297                         "vmls.f32   q5, q14, %f25[0]    \n"
2298 
2299                         "vst1.f32   {d4-d5}, [%0]!      \n" // tmp[0][m]
2300 
2301                         "vmov       q3, q7              \n" // use q7
2302 
2303                         "vadd.f32   q2, q13, q6         \n" // use q6
2304                         "vmla.f32   q3, q10, %e24[1]    \n"
2305 
2306                         "vadd.f32   q8, q4, q5          \n"
2307                         "vsub.f32   q9, q4, q5          \n"
2308 
2309                         "vmov       q5, q7              \n" // use q7
2310 
2311                         "vadd.f32   q6, q12, q6         \n" // use q6
2312                         "vmla.f32   q5, q10, %f24[1]    \n"
2313 
2314                         "vmov       q4, q13             \n"
2315 
2316                         "vmla.f32   q2, q12, %e24[0]    \n"
2317                         "vmla.f32   q3, q11, %f24[1]    \n"
2318 
2319                         "vst1.f32   {d16-d17}, [%1]!    \n" // tmp[1][m]
2320 
2321                         "vmla.f32   q4, q6, %e25[1]     \n"
2322                         "vmla.f32   q5, q11, %e24[1]    \n"
2323 
2324                         "vst1.f32   {d18-d19}, [%2]!    \n" // tmp[2][m]
2325 
2326                         "vadd.f32   q8, q2, q3          \n"
2327                         "vsub.f32   q9, q2, q3          \n"
2328 
2329                         "vsub.f32   q6, q15, q10        \n"
2330                         "vsub.f32   q7, q14, q11        \n"
2331 
2332                         "vadd.f32   q2, q4, q5          \n"
2333                         "vsub.f32   q3, q4, q5          \n"
2334 
2335                         "vst1.f32   {d16-d17}, [%3]!    \n" // tmp[3][m]
2336                         "vst1.f32   {d18-d19}, [%4]!    \n" // tmp[4][m]
2337 
2338                         "vmla.f32   q6, q7, %f25[1]     \n"
2339 
2340                         "vst1.f32   {d4-d5}, [%5]!      \n" // tmp[5][m]
2341                         "vst1.f32   {d6-d7}, [%6]!      \n" // tmp[6][m]
2342 
2343                         "vst1.f32   {d12-d13}, [%7]!    \n" // tmp[7][m]
2344 
2345                         : "=r"(t0), // %0
2346                         "=r"(t1), // %1
2347                         "=r"(t2), // %2
2348                         "=r"(t3), // %3
2349                         "=r"(t4), // %4
2350                         "=r"(t5), // %5
2351                         "=r"(t6), // %6
2352                         "=r"(t7), // %7
2353                         "=r"(r0), // %8
2354                         "=r"(r1), // %9
2355                         "=r"(r2), // %10
2356                         "=r"(r3)  // %11
2357                         : "0"(t0),
2358                         "1"(t1),
2359                         "2"(t2),
2360                         "3"(t3),
2361                         "4"(t4),
2362                         "5"(t5),
2363                         "6"(t6),
2364                         "7"(t7),
2365                         "8"(r0),
2366                         "9"(r1),
2367                         "10"(r2),
2368                         "11"(r3),
2369                         "w"(_coeff0), // %24
2370                         "w"(_coeff1), // %25
2371                         "r"(stepw)    // %26
2372                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2373 
2374                     t0 = tmp[0];
2375                     t1 = tmp[1];
2376                     t2 = tmp[2];
2377                     t3 = tmp[3];
2378 
2379                     float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
2380                     float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
2381                     float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
2382                     float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
2383                     float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
2384                     float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
2385                     float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
2386                     float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
2387 
2388                     int step = img0_tm.w * tiles * 2 * 4 * 4;
2389 
2390                     asm volatile(
2391 
2392                         // loop0
2393                         "vld1.f32   {d16-d19}, [%8]     \n"
2394                         "add        %8, %8, #128        \n"
2395                         "vld1.f32   {d20-d23}, [%9]     \n"
2396                         "add        %9, %9, #128        \n"
2397                         "vld1.f32   {d24-d27}, [%10]    \n"
2398                         "add        %10, %10, #128      \n"
2399 
2400                         "vtrn.32    q8, q10             \n"
2401 
2402                         "vld1.f32   {d28-d31}, [%11]    \n"
2403                         "add        %11, %11, #128      \n"
2404 
2405                         "vtrn.32    q9, q11             \n"
2406                         "vtrn.32    q12, q14            \n"
2407                         "vtrn.32    q13, q15            \n"
2408 
2409                         "vswp       d17, d24            \n"
2410                         "vswp       d19, d26            \n"
2411                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
2412                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
2413 
2414                         "vsub.f32   q2, q8, q13         \n"
2415                         "vsub.f32   q3, q9, q12         \n"
2416 
2417                         "vadd.f32   q4, q12, q13        \n"
2418                         "vadd.f32   q5, q10, q11        \n"
2419 
2420                         "vmla.f32   q2, q3, %f25[1]     \n"
2421 
2422                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
2423                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
2424 
2425                         "vmls.f32   q4, q9, %f25[0]     \n"
2426                         "vmls.f32   q5, q14, %f25[0]    \n"
2427 
2428                         "vst1.f32   {d4[0]}, [%0]!      \n"
2429                         "vst1.f32   {d4[1]}, [%2]!      \n"
2430 
2431                         "vmov       q3, q7              \n" // use q7
2432 
2433                         "vst1.f32   {d5[0]}, [%4]!      \n"
2434                         "vst1.f32   {d5[1]}, [%6]!      \n"
2435 
2436                         "vadd.f32   q2, q13, q6         \n" // use q6
2437                         "vmla.f32   q3, q10, %e24[1]    \n"
2438 
2439                         "vadd.f32   q8, q4, q5          \n"
2440                         "vsub.f32   q9, q4, q5          \n"
2441 
2442                         "vmov       q5, q7              \n" // use q7
2443 
2444                         "vadd.f32   q6, q12, q6         \n" // use q6
2445                         "vmla.f32   q5, q10, %f24[1]    \n"
2446 
2447                         "vmov       q4, q13             \n"
2448 
2449                         "vmla.f32   q2, q12, %e24[0]    \n"
2450                         "vmla.f32   q3, q11, %f24[1]    \n"
2451 
2452                         "vst1.f32   {d16[0]}, [%0]!     \n"
2453                         "vst1.f32   {d16[1]}, [%2]!     \n"
2454 
2455                         "vmla.f32   q4, q6, %e25[1]     \n"
2456 
2457                         "vst1.f32   {d17[0]}, [%4]!     \n"
2458                         "vst1.f32   {d17[1]}, [%6]!     \n"
2459 
2460                         "vmla.f32   q5, q11, %e24[1]    \n"
2461 
2462                         "vst1.f32   {d18[0]}, [%0]!     \n"
2463                         "vst1.f32   {d18[1]}, [%2]!     \n"
2464 
2465                         "vadd.f32   q8, q2, q3          \n"
2466 
2467                         "vst1.f32   {d19[0]}, [%4]!     \n"
2468                         "vst1.f32   {d19[1]}, [%6]!     \n"
2469 
2470                         "vsub.f32   q9, q2, q3          \n"
2471 
2472                         "vsub.f32   q6, q15, q10        \n"
2473                         "vsub.f32   q7, q14, q11        \n"
2474 
2475                         "vadd.f32   q2, q4, q5          \n"
2476                         "vsub.f32   q3, q4, q5          \n"
2477 
2478                         "vst1.f32   {d16[0]}, [%0], %26 \n"
2479                         "vst1.f32   {d16[1]}, [%2], %26 \n"
2480 
2481                         "vmla.f32   q6, q7, %f25[1]     \n"
2482 
2483                         "vst1.f32   {d17[0]}, [%4], %26 \n"
2484                         "vst1.f32   {d17[1]}, [%6], %26 \n"
2485 
2486                         "vtrn.32    q9, q2              \n"
2487                         "vtrn.32    q3, q6              \n"
2488 
2489                         "sub        %0, %0, #12         \n"
2490                         "sub        %2, %2, #12         \n"
2491                         "sub        %4, %4, #12         \n"
2492                         "sub        %6, %6, #12         \n"
2493 
2494                         "vswp       d19, d6             \n"
2495                         "vswp       d5, d12             \n"
2496 
2497                         "vst1.f32   {d18-d19}, [%1], %26 \n"
2498                         "vst1.f32   {d4-d5}, [%3], %26  \n"
2499                         "vst1.f32   {d6-d7}, [%5], %26  \n"
2500                         "vst1.f32   {d12-d13}, [%7], %26 \n"
2501 
2502                         // loop1
2503                         "vld1.f32   {d16-d19}, [%8]     \n"
2504                         "vld1.f32   {d20-d23}, [%9]     \n"
2505                         "vld1.f32   {d24-d27}, [%10]    \n"
2506 
2507                         "vtrn.32    q8, q10             \n"
2508 
2509                         "vld1.f32   {d28-d31}, [%11]    \n"
2510 
2511                         "vtrn.32    q9, q11             \n"
2512                         "vtrn.32    q12, q14            \n"
2513                         "vtrn.32    q13, q15            \n"
2514 
2515                         "vswp       d17, d24            \n"
2516                         "vswp       d19, d26            \n"
2517                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
2518                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
2519 
2520                         "vsub.f32   q2, q8, q13         \n"
2521                         "vsub.f32   q3, q9, q12         \n"
2522 
2523                         "vadd.f32   q4, q12, q13        \n"
2524                         "vadd.f32   q5, q10, q11        \n"
2525 
2526                         "vmla.f32   q2, q3, %f25[1]     \n"
2527 
2528                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
2529                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
2530 
2531                         "vmls.f32   q4, q9, %f25[0]     \n"
2532                         "vmls.f32   q5, q14, %f25[0]    \n"
2533 
2534                         "vst1.f32   {d4[0]}, [%0]!      \n"
2535                         "vst1.f32   {d4[1]}, [%2]!      \n"
2536 
2537                         "vmov       q3, q7              \n" // use q7
2538 
2539                         "vst1.f32   {d5[0]}, [%4]!      \n"
2540                         "vst1.f32   {d5[1]}, [%6]!      \n"
2541 
2542                         "vadd.f32   q2, q13, q6         \n" // use q6
2543                         "vmla.f32   q3, q10, %e24[1]    \n"
2544 
2545                         "vadd.f32   q8, q4, q5          \n"
2546                         "vsub.f32   q9, q4, q5          \n"
2547 
2548                         "vmov       q5, q7              \n" // use q7
2549 
2550                         "vadd.f32   q6, q12, q6         \n" // use q6
2551                         "vmla.f32   q5, q10, %f24[1]    \n"
2552 
2553                         "vmov       q4, q13             \n"
2554 
2555                         "vmla.f32   q2, q12, %e24[0]    \n"
2556                         "vmla.f32   q3, q11, %f24[1]    \n"
2557 
2558                         "vst1.f32   {d16[0]}, [%0]!     \n"
2559                         "vst1.f32   {d16[1]}, [%2]!     \n"
2560 
2561                         "vmla.f32   q4, q6, %e25[1]     \n"
2562 
2563                         "vst1.f32   {d17[0]}, [%4]!     \n"
2564                         "vst1.f32   {d17[1]}, [%6]!     \n"
2565 
2566                         "vmla.f32   q5, q11, %e24[1]    \n"
2567 
2568                         "vst1.f32   {d18[0]}, [%0]!     \n"
2569                         "vst1.f32   {d18[1]}, [%2]!     \n"
2570 
2571                         "vadd.f32   q8, q2, q3          \n"
2572 
2573                         "vst1.f32   {d19[0]}, [%4]!     \n"
2574                         "vst1.f32   {d19[1]}, [%6]!     \n"
2575 
2576                         "vsub.f32   q9, q2, q3          \n"
2577 
2578                         "vsub.f32   q6, q15, q10        \n"
2579                         "vsub.f32   q7, q14, q11        \n"
2580 
2581                         "vadd.f32   q2, q4, q5          \n"
2582                         "vsub.f32   q3, q4, q5          \n"
2583 
2584                         "vst1.f32   {d16[0]}, [%0]      \n"
2585                         "vst1.f32   {d16[1]}, [%2]      \n"
2586 
2587                         "vmla.f32   q6, q7, %f25[1]     \n"
2588 
2589                         "vst1.f32   {d17[0]}, [%4]      \n"
2590                         "vst1.f32   {d17[1]}, [%6]      \n"
2591 
2592                         "vtrn.32    q9, q2              \n"
2593                         "vtrn.32    q3, q6              \n"
2594 
2595                         "vswp       d19, d6             \n"
2596                         "vswp       d5, d12             \n"
2597 
2598                         "vst1.f32   {d18-d19}, [%1]     \n"
2599                         "vst1.f32   {d4-d5}, [%3]       \n"
2600                         "vst1.f32   {d6-d7}, [%5]       \n"
2601                         "vst1.f32   {d12-d13}, [%7]     \n"
2602 
2603                         : "=r"(r0_tm0_0), // %0
2604                         "=r"(r0_tm0_4), // %1
2605                         "=r"(r0_tm1_0), // %2
2606                         "=r"(r0_tm1_4), // %3
2607                         "=r"(r0_tm2_0), // %4
2608                         "=r"(r0_tm2_4), // %5
2609                         "=r"(r0_tm3_0), // %6
2610                         "=r"(r0_tm3_4), // %7
2611                         "=r"(t0),       // %8
2612                         "=r"(t1),       // %9
2613                         "=r"(t2),       // %10
2614                         "=r"(t3)        // %11
2615                         : "0"(r0_tm0_0),
2616                         "1"(r0_tm0_4),
2617                         "2"(r0_tm1_0),
2618                         "3"(r0_tm1_4),
2619                         "4"(r0_tm2_0),
2620                         "5"(r0_tm2_4),
2621                         "6"(r0_tm3_0),
2622                         "7"(r0_tm3_4),
2623                         "8"(t0),
2624                         "9"(t1),
2625                         "10"(t2),
2626                         "11"(t3),
2627                         "w"(_coeff0), // %24
2628                         "w"(_coeff1), // %25
2629                         "r"(step)     // %26
2630                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2631 #endif // __aarch64__
2632 #else
2633                     const float* r0 = img0.row(i * 6) + j * 6;
2634 
2635                     for (int m = 0; m < 8; m++)
2636                     {
2637                         tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25f;
2638                         tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25f;
2639 
2640                         float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25f);
2641                         float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25f);
2642 
2643                         tmp[1][m] = tmp12a + tmp12b;
2644                         tmp[2][m] = tmp12a - tmp12b;
2645 
2646                         float tmp34a = (r0[6] + r0[2] * 0.25f - r0[4] * 1.25f);
2647                         float tmp34b = (r0[1] * 0.5f - r0[3] * 2.5f + r0[5] * 2.f);
2648 
2649                         tmp[3][m] = tmp34a + tmp34b;
2650                         tmp[4][m] = tmp34a - tmp34b;
2651 
2652                         float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25f) * 4.f);
2653                         float tmp56b = (r0[1] * 2.f - r0[3] * 2.5f + r0[5] * 0.5f);
2654 
2655                         tmp[5][m] = tmp56a + tmp56b;
2656                         tmp[6][m] = tmp56a - tmp56b;
2657 
2658                         r0 += w;
2659                     }
2660 
2661                     float* r0_tm_0 = img0_tm.row(i * w_tm / 8 + j);
2662                     float* r0_tm_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
2663 
2664                     for (int m = 0; m < 8; m++)
2665                     {
2666                         const float* tmp0 = tmp[m];
2667 
2668                         r0_tm_0[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f;
2669                         r0_tm_4[3] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f;
2670 
2671                         float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25f);
2672                         float tmp12b = (tmp0[1] - tmp0[3] * 4.25f + tmp0[5]);
2673 
2674                         r0_tm_0[1] = tmp12a + tmp12b;
2675                         r0_tm_0[2] = tmp12a - tmp12b;
2676 
2677                         float tmp34a = (tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f);
2678                         float tmp34b = (tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f);
2679 
2680                         r0_tm_0[3] = tmp34a + tmp34b;
2681                         r0_tm_4[0] = tmp34a - tmp34b;
2682 
2683                         float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f);
2684                         float tmp56b = (tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f);
2685 
2686                         r0_tm_4[1] = tmp56a + tmp56b;
2687                         r0_tm_4[2] = tmp56a - tmp56b;
2688 
2689                         r0_tm_0 += img0_tm.w * tiles * 2;
2690                         r0_tm_4 += img0_tm.w * tiles * 2;
2691                     }
2692 #endif // __ARM_NEON
2693                 }
2694             }
2695         }
2696     }
2697     bottom_blob_bordered = Mat();
2698     // END transform input
2699 
2700     // BEGIN dot
2701     Mat top_blob_tm;
2702     {
2703         int w_tm = outw / 6 * 8;
2704         int h_tm = outh / 6 * 8;
2705         top_blob_tm.create(4, 16 * w_tm / 8 * h_tm / 8, outch, 4u, opt.workspace_allocator);
2706 
2707         const int tiles = h_tm / 8 * w_tm / 8;
2708 
2709         int nn_outch = outch >> 2;
2710         int remain_outch_start = nn_outch << 2;
2711 
2712         #pragma omp parallel for num_threads(opt.num_threads)
2713         for (int pp = 0; pp < nn_outch; pp++)
2714         {
2715             int p = pp * 4;
2716 
2717             Mat out0_tm = top_blob_tm.channel(p);
2718             Mat out1_tm = top_blob_tm.channel(p + 1);
2719             Mat out2_tm = top_blob_tm.channel(p + 2);
2720             Mat out3_tm = top_blob_tm.channel(p + 3);
2721 
2722             const float* ktm = kernel_tm.channel(pp);
2723 
2724             out0_tm.fill(0.f);
2725             out1_tm.fill(0.f);
2726             out2_tm.fill(0.f);
2727             out3_tm.fill(0.f);
2728 
2729             int q = 0;
2730 
2731 #if __ARM_NEON && __aarch64__
2732             for (; q + 3 < inch; q += 4)
2733             {
2734                 const float* r0 = bottom_blob_tm.channel(q);
2735                 const float* r1 = bottom_blob_tm.channel(q + 1);
2736                 const float* r2 = bottom_blob_tm.channel(q + 2);
2737                 const float* r3 = bottom_blob_tm.channel(q + 3);
2738 
2739                 float* output0_tm = out0_tm;
2740                 float* output1_tm = out1_tm;
2741                 float* output2_tm = out2_tm;
2742                 float* output3_tm = out3_tm;
2743 
2744                 asm volatile(
2745                     "mov    w0, #16                     \n" // w0 = r = 16
2746                     "0:                                 \n"
2747 
2748                     "prfm   pldl1keep, [%8, #512]                       \n"
2749                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%8], #64     \n" // v0  v1  v2  v3  = _k00 _k01 _k02 _k03
2750 
2751                     "prfm   pldl1keep, [%8, #512]                       \n"
2752                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%8], #64     \n" // v4  v5  v6  v7  = _k10 _k11 _k12 _k13
2753 
2754                     "prfm   pldl1keep, [%8, #512]                       \n"
2755                     "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64   \n" // v8  v9  v10 v11 = _k20 _k21 _k22 _k23
2756 
2757                     "prfm   pldl1keep, [%8, #512]                       \n"
2758                     "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n" // v12 v13 v14 v15 = _k30 _k31 _k32 _k33
2759 
2760                     // tile loop
2761                     "lsr    w1, %w18, #2                \n" // w1 = nn = tiles >> 2
2762                     "cmp    w1, #0                      \n"
2763                     "beq    2f                          \n"
2764 
2765                     //BEGIN tile loop
2766                     "prfm   pldl1keep, [%4, #128]       \n" //
2767                     "ld1    {v16.4s}, [%4], #16         \n"
2768 
2769                     "1:                                 \n"
2770 
2771                     "prfm   pldl1keep, [%0, #128]       \n"
2772                     "ld1    {v20.4s}, [%0]              \n"
2773                     "add    x4, %0, #16                 \n" // x4 = %0 next
2774 
2775                     "fmla   v20.4s, v16.4s, v0.4s       \n"
2776 
2777                     "prfm   pldl1keep, [%1, #128]       \n"
2778                     "ld1    {v21.4s}, [%1]              \n"
2779                     "add    x5, %1, #16                 \n" // x5 = %1 next
2780 
2781                     "fmla   v21.4s, v16.4s, v4.4s       \n"
2782 
2783                     "prfm   pldl1keep, [%2, #128]       \n"
2784                     "ld1    {v22.4s}, [%2]              \n"
2785                     "add    x6, %2, #16                 \n" // x6 = %2 next
2786 
2787                     "fmla   v22.4s, v16.4s, v8.4s       \n"
2788 
2789                     "prfm   pldl1keep, [%3, #128]       \n"
2790                     "ld1    {v23.4s}, [%3]              \n"
2791                     "add    x7, %3, #16                 \n" // x7 = %3 next
2792 
2793                     "prfm   pldl1keep, [%5, #128]       \n"
2794                     "ld1    {v17.4s}, [%5], #16         \n"
2795 
2796                     "fmla   v23.4s, v16.4s, v12.4s      \n"
2797 
2798                     "prfm   pldl1keep, [x4, #128]       \n"
2799                     "ld1    {v24.4s}, [x4]              \n"
2800 
2801                     "fmla   v20.4s, v17.4s, v1.4s       \n"
2802                     "fmla   v21.4s, v17.4s, v5.4s       \n"
2803 
2804                     "prfm   pldl1keep, [%6, #128]       \n"
2805                     "ld1    {v18.4s}, [%6], #16         \n"
2806 
2807                     "fmla   v22.4s, v17.4s, v9.4s       \n"
2808                     "fmla   v23.4s, v17.4s, v13.4s      \n"
2809 
2810                     "prfm   pldl1keep, [x5, #128]       \n"
2811                     "ld1    {v25.4s}, [x5]              \n"
2812 
2813                     "fmla   v20.4s, v18.4s, v2.4s       \n"
2814                     "fmla   v21.4s, v18.4s, v6.4s       \n"
2815 
2816                     "prfm   pldl1keep, [%7, #128]       \n"
2817                     "ld1    {v19.4s}, [%7], #16         \n"
2818 
2819                     "fmla   v22.4s, v18.4s, v10.4s      \n"
2820                     "fmla   v23.4s, v18.4s, v14.4s      \n"
2821 
2822                     "prfm   pldl1keep, [x6, #128]       \n"
2823                     "ld1    {v26.4s}, [x6]              \n"
2824 
2825                     "fmla   v20.4s, v19.4s, v3.4s       \n"
2826                     "fmla   v21.4s, v19.4s, v7.4s       \n"
2827 
2828                     "prfm   pldl1keep, [%4, #128]       \n"
2829                     "ld1    {v16.4s}, [%4], #16         \n"
2830 
2831                     "fmla   v22.4s, v19.4s, v11.4s      \n"
2832                     "fmla   v23.4s, v19.4s, v15.4s      \n"
2833 
2834                     ///////
2835 
2836                     "prfm   pldl1keep, [x7, #128]       \n"
2837                     "ld1    {v27.4s}, [x7]              \n"
2838 
2839                     "st1    {v20.4s}, [%0]              \n"
2840                     "add    %0, %0, #32                 \n"
2841 
2842                     "fmla   v24.4s, v16.4s, v0.4s       \n"
2843                     "fmla   v25.4s, v16.4s, v4.4s       \n"
2844 
2845                     "prfm   pldl1keep, [%5, #128]       \n"
2846                     "ld1    {v17.4s}, [%5], #16         \n"
2847 
2848                     "fmla   v26.4s, v16.4s, v8.4s       \n"
2849                     "fmla   v27.4s, v16.4s, v12.4s      \n"
2850 
2851                     "prfm   pldl1keep, [%0, #128]       \n"
2852                     "ld1    {v20.4s}, [%0]              \n"
2853 
2854                     "st1    {v21.4s}, [%1]              \n"
2855                     "add    %1, %1, #32                 \n"
2856 
2857                     "fmla   v24.4s, v17.4s, v1.4s       \n"
2858                     "fmla   v25.4s, v17.4s, v5.4s       \n"
2859 
2860                     "prfm   pldl1keep, [%6, #128]       \n"
2861                     "ld1    {v18.4s}, [%6], #16         \n"
2862 
2863                     "fmla   v26.4s, v17.4s, v9.4s       \n"
2864                     "fmla   v27.4s, v17.4s, v13.4s      \n"
2865 
2866                     "prfm   pldl1keep, [%1, #128]       \n"
2867                     "ld1    {v21.4s}, [%1]              \n"
2868 
2869                     "st1    {v22.4s}, [%2]              \n"
2870                     "add    %2, %2, #32                 \n"
2871 
2872                     "fmla   v24.4s, v18.4s, v2.4s       \n"
2873                     "fmla   v25.4s, v18.4s, v6.4s       \n"
2874 
2875                     "prfm   pldl1keep, [%7, #128]       \n"
2876                     "ld1    {v19.4s}, [%7], #16         \n"
2877 
2878                     "fmla   v26.4s, v18.4s, v10.4s      \n"
2879                     "fmla   v27.4s, v18.4s, v14.4s      \n"
2880 
2881                     "prfm   pldl1keep, [%2, #128]       \n"
2882                     "ld1    {v22.4s}, [%2]              \n"
2883 
2884                     "st1    {v23.4s}, [%3]              \n"
2885                     "add    %3, %3, #32                 \n"
2886 
2887                     "fmla   v24.4s, v19.4s, v3.4s       \n"
2888                     "fmla   v25.4s, v19.4s, v7.4s       \n"
2889 
2890                     "prfm   pldl1keep, [%4, #128]       \n"
2891                     "ld1    {v16.4s}, [%4], #16         \n"
2892 
2893                     "fmla   v26.4s, v19.4s, v11.4s      \n"
2894                     "fmla   v27.4s, v19.4s, v15.4s      \n"
2895 
2896                     ///////
2897 
2898                     "prfm   pldl1keep, [%3, #128]       \n"
2899                     "ld1    {v23.4s}, [%3]              \n"
2900 
2901                     "st1    {v24.4s}, [x4]              \n"
2902                     "add    x4, x4, #32                 \n"
2903 
2904                     "fmla   v20.4s, v16.4s, v0.4s       \n"
2905                     "fmla   v21.4s, v16.4s, v4.4s       \n"
2906 
2907                     "prfm   pldl1keep, [%5, #128]       \n"
2908                     "ld1    {v17.4s}, [%5], #16         \n"
2909 
2910                     "fmla   v22.4s, v16.4s, v8.4s       \n"
2911                     "fmla   v23.4s, v16.4s, v12.4s      \n"
2912 
2913                     "prfm   pldl1keep, [x4, #128]       \n"
2914                     "ld1    {v24.4s}, [x4]              \n"
2915 
2916                     "st1    {v25.4s}, [x5]              \n"
2917                     "add    x5, x5, #32                 \n"
2918 
2919                     "fmla   v20.4s, v17.4s, v1.4s       \n"
2920                     "fmla   v21.4s, v17.4s, v5.4s       \n"
2921 
2922                     "prfm   pldl1keep, [%6, #128]       \n"
2923                     "ld1    {v18.4s}, [%6], #16         \n"
2924 
2925                     "fmla   v22.4s, v17.4s, v9.4s       \n"
2926                     "fmla   v23.4s, v17.4s, v13.4s      \n"
2927 
2928                     "prfm   pldl1keep, [x5, #128]       \n"
2929                     "ld1    {v25.4s}, [x5]              \n"
2930 
2931                     "st1    {v26.4s}, [x6]              \n"
2932                     "add    x6, x6, #32                 \n"
2933 
2934                     "fmla   v20.4s, v18.4s, v2.4s       \n"
2935                     "fmla   v21.4s, v18.4s, v6.4s       \n"
2936 
2937                     "prfm   pldl1keep, [%7, #128]       \n"
2938                     "ld1    {v19.4s}, [%7], #16         \n"
2939 
2940                     "fmla   v22.4s, v18.4s, v10.4s      \n"
2941                     "fmla   v23.4s, v18.4s, v14.4s      \n"
2942 
2943                     "prfm   pldl1keep, [x6, #128]       \n"
2944                     "ld1    {v26.4s}, [x6]              \n"
2945 
2946                     "st1    {v27.4s}, [x7]              \n"
2947                     "add    x7, x7, #32                 \n"
2948 
2949                     "fmla   v20.4s, v19.4s, v3.4s       \n"
2950                     "fmla   v21.4s, v19.4s, v7.4s       \n"
2951 
2952                     "prfm   pldl1keep, [%4, #128]       \n"
2953                     "ld1    {v16.4s}, [%4], #16         \n"
2954 
2955                     "fmla   v22.4s, v19.4s, v11.4s      \n"
2956                     "fmla   v23.4s, v19.4s, v15.4s      \n"
2957 
2958                     ///////
2959 
2960                     "prfm   pldl1keep, [x7, #128]       \n"
2961                     "ld1    {v27.4s}, [x7]              \n"
2962 
2963                     "st1    {v20.4s}, [%0]              \n"
2964 
2965                     "fmla   v24.4s, v16.4s, v0.4s       \n"
2966                     "fmla   v25.4s, v16.4s, v4.4s       \n"
2967 
2968                     "prfm   pldl1keep, [%5, #128]       \n"
2969                     "ld1    {v17.4s}, [%5], #16         \n"
2970 
2971                     "fmla   v26.4s, v16.4s, v8.4s       \n"
2972                     "fmla   v27.4s, v16.4s, v12.4s      \n"
2973 
2974                     "st1    {v21.4s}, [%1]              \n"
2975 
2976                     "fmla   v24.4s, v17.4s, v1.4s       \n"
2977                     "fmla   v25.4s, v17.4s, v5.4s       \n"
2978 
2979                     "prfm   pldl1keep, [%6, #128]       \n"
2980                     "ld1    {v18.4s}, [%6], #16         \n"
2981 
2982                     "fmla   v26.4s, v17.4s, v9.4s       \n"
2983                     "fmla   v27.4s, v17.4s, v13.4s      \n"
2984 
2985                     "st1    {v22.4s}, [%2]              \n"
2986 
2987                     "fmla   v24.4s, v18.4s, v2.4s       \n"
2988                     "fmla   v25.4s, v18.4s, v6.4s       \n"
2989 
2990                     "prfm   pldl1keep, [%7, #128]       \n"
2991                     "ld1    {v19.4s}, [%7], #16         \n"
2992 
2993                     "fmla   v26.4s, v18.4s, v10.4s      \n"
2994                     "fmla   v27.4s, v18.4s, v14.4s      \n"
2995 
2996                     "st1    {v23.4s}, [%3]              \n"
2997 
2998                     "fmla   v24.4s, v19.4s, v3.4s       \n"
2999                     "fmla   v25.4s, v19.4s, v7.4s       \n"
3000 
3001                     "prfm   pldl1keep, [%4, #128]       \n"
3002                     "ld1    {v16.4s}, [%4], #16         \n"
3003 
3004                     "fmla   v26.4s, v19.4s, v11.4s      \n"
3005                     "fmla   v27.4s, v19.4s, v15.4s      \n"
3006 
3007                     "st1    {v24.4s}, [x4], #16         \n"
3008                     "mov    %0, x4                      \n"
3009 
3010                     "st1    {v25.4s}, [x5], #16         \n"
3011                     "mov    %1, x5                      \n"
3012 
3013                     "subs   w1, w1, #1                  \n"
3014 
3015                     "st1    {v26.4s}, [x6], #16         \n"
3016                     "mov    %2, x6                      \n"
3017 
3018                     "st1    {v27.4s}, [x7], #16         \n"
3019                     "mov    %3, x7                      \n"
3020 
3021                     "bne    1b                          \n"
3022                     "sub    %4, %4, #16                 \n"
3023                     //END tile loop
3024 
3025                     "2:                                 \n"
3026 
3027                     // remain loop
3028                     "and    w1, %w18, #3                \n" // w1 = remain = tiles & 3;
3029                     "cmp    w1, #0                      \n"
3030                     "beq    4f                          \n"
3031 
3032                     //BEGIN remain loop
3033                     "3:                                 \n"
3034 
3035                     "prfm   pldl1keep, [%4, #128]       \n"
3036                     "ld1    {v16.4s}, [%4], #16         \n"
3037 
3038                     "prfm   pldl1keep, [%0, #128]       \n"
3039                     "ld1    {v20.4s}, [%0]              \n"
3040 
3041                     "fmla   v20.4s, v16.4s, v0.4s       \n"
3042 
3043                     "prfm   pldl1keep, [%1, #128]       \n"
3044                     "ld1    {v21.4s}, [%1]              \n"
3045 
3046                     "fmla   v21.4s, v16.4s, v4.4s       \n"
3047 
3048                     "prfm   pldl1keep, [%2, #128]       \n"
3049                     "ld1    {v22.4s}, [%2]              \n"
3050 
3051                     "fmla   v22.4s, v16.4s, v8.4s       \n"
3052 
3053                     "prfm   pldl1keep, [%3, #128]       \n"
3054                     "ld1    {v23.4s}, [%3]              \n"
3055 
3056                     "fmla   v23.4s, v16.4s, v12.4s      \n"
3057 
3058                     "prfm   pldl1keep, [%5, #128]       \n"
3059                     "ld1    {v17.4s}, [%5], #16         \n"
3060 
3061                     "fmla   v20.4s, v17.4s, v1.4s       \n"
3062                     "fmla   v21.4s, v17.4s, v5.4s       \n"
3063 
3064                     "fmla   v22.4s, v17.4s, v9.4s       \n"
3065                     "fmla   v23.4s, v17.4s, v13.4s      \n"
3066 
3067                     "prfm   pldl1keep, [%6, #128]       \n"
3068                     "ld1    {v18.4s}, [%6], #16         \n"
3069 
3070                     "fmla   v20.4s, v18.4s, v2.4s       \n"
3071                     "fmla   v21.4s, v18.4s, v6.4s       \n"
3072 
3073                     "fmla   v22.4s, v18.4s, v10.4s      \n"
3074                     "fmla   v23.4s, v18.4s, v14.4s      \n"
3075 
3076                     "prfm   pldl1keep, [%7, #128]       \n"
3077                     "ld1    {v19.4s}, [%7], #16         \n"
3078 
3079                     "fmla   v20.4s, v19.4s, v3.4s       \n"
3080                     "fmla   v21.4s, v19.4s, v7.4s       \n"
3081                     "fmla   v22.4s, v19.4s, v11.4s      \n"
3082                     "fmla   v23.4s, v19.4s, v15.4s      \n"
3083 
3084                     "st1    {v20.4s}, [%0], #16         \n"
3085                     "st1    {v21.4s}, [%1], #16         \n"
3086 
3087                     "subs   w1, w1, #1                  \n"
3088 
3089                     "st1    {v22.4s}, [%2], #16         \n"
3090                     "st1    {v23.4s}, [%3], #16         \n"
3091 
3092                     "bne    3b                          \n"
3093                     //END remain loop
3094 
3095                     "4:                                 \n"
3096 
3097                     "subs   w0, w0, #1                  \n"
3098                     "bne    0b                          \n"
3099 
3100                     : "=r"(output0_tm), // %0
3101                     "=r"(output1_tm), // %1
3102                     "=r"(output2_tm), // %2
3103                     "=r"(output3_tm), // %3
3104                     "=r"(r0),         // %4
3105                     "=r"(r1),         // %5
3106                     "=r"(r2),         // %6
3107                     "=r"(r3),         // %7
3108                     "=r"(ktm)         // %8
3109                     : "0"(output0_tm),
3110                     "1"(output1_tm),
3111                     "2"(output2_tm),
3112                     "3"(output3_tm),
3113                     "4"(r0),
3114                     "5"(r1),
3115                     "6"(r2),
3116                     "7"(r3),
3117                     "8"(ktm),
3118                     "r"(tiles) // %18
3119                     : "cc", "memory", "x0", "x1", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
3120             }
3121 #endif // __ARM_NEON && __aarch64__
3122 
3123             for (; q + 1 < inch; q += 2)
3124             {
3125                 const float* r0 = bottom_blob_tm.channel(q);
3126                 const float* r1 = bottom_blob_tm.channel(q + 1);
3127 
3128                 float* output0_tm = out0_tm;
3129                 float* output1_tm = out1_tm;
3130                 float* output2_tm = out2_tm;
3131                 float* output3_tm = out3_tm;
3132 
3133 #if __ARM_NEON
3134 #if __aarch64__
3135                 asm volatile(
3136                     "mov    w0, #16                     \n" // w0 = r = 16
3137                     "0:                                 \n"
3138 
3139                     "prfm   pldl1keep, [%6, #256]       \n"
3140                     "ld1    {v0.4s, v1.4s}, [%6], #32   \n" // v0 v1 = _k00 _k01
3141 
3142                     "prfm   pldl1keep, [%6, #256]       \n"
3143                     "ld1    {v2.4s, v3.4s}, [%6], #32   \n" // v2 v3 = _k10 _k11
3144 
3145                     "prfm   pldl1keep, [%6, #256]       \n"
3146                     "ld1    {v4.4s, v5.4s}, [%6], #32   \n" // v4 v5 = _k20 _k21
3147 
3148                     "prfm   pldl1keep, [%6, #256]       \n"
3149                     "ld1    {v6.4s, v7.4s}, [%6], #32   \n" // v6 v7 = _k30 _k31
3150 
3151                     // tile loop
3152                     "lsr    w1, %w14, #2                \n" // w1 = nn = tiles >> 2
3153                     "cmp    w1, #0                      \n"
3154                     "beq    2f                          \n"
3155 
3156                     //BEGIN tile loop
3157                     "prfm   pldl1keep, [%4, #128]       \n"
3158                     "ld1    {v20.4s}, [%4], #16         \n"
3159 
3160                     "1:                                 \n"
3161 
3162                     "prfm   pldl1keep, [%0, #128]       \n"
3163                     "ld1    {v16.4s}, [%0]              \n"
3164 
3165                     "fmla   v16.4s, v20.4s, v0.4s       \n"
3166 
3167                     "prfm   pldl1keep, [%1, #128]       \n"
3168                     "ld1    {v17.4s}, [%1]              \n"
3169 
3170                     "fmla   v17.4s, v20.4s, v2.4s       \n"
3171 
3172                     "prfm   pldl1keep, [%2, #128]       \n"
3173                     "ld1    {v18.4s}, [%2]              \n"
3174 
3175                     "fmla   v18.4s, v20.4s, v4.4s       \n"
3176 
3177                     "prfm   pldl1keep, [%3, #128]       \n"
3178                     "ld1    {v19.4s}, [%3]              \n"
3179 
3180                     "fmla   v19.4s, v20.4s, v6.4s       \n"
3181 
3182                     "prfm   pldl1keep, [%5, #128]       \n"
3183                     "ld1    {v21.4s}, [%5], #16         \n"
3184 
3185                     "fmla   v16.4s, v21.4s, v1.4s       \n"
3186                     "fmla   v17.4s, v21.4s, v3.4s       \n"
3187 
3188                     "prfm   pldl1keep, [%4, #128]       \n"
3189                     "ld1    {v20.4s}, [%4], #16         \n"
3190 
3191                     "fmla   v18.4s, v21.4s, v5.4s       \n"
3192                     "fmla   v19.4s, v21.4s, v7.4s       \n"
3193 
3194                     "st1    {v16.4s}, [%0], #16         \n"
3195                     "st1    {v17.4s}, [%1], #16         \n"
3196 
3197                     ////
3198 
3199                     "prfm   pldl1keep, [%0, #128]       \n"
3200                     "ld1    {v16.4s}, [%0]              \n"
3201 
3202                     "fmla   v16.4s, v20.4s, v0.4s       \n"
3203 
3204                     "prfm   pldl1keep, [%1, #128]       \n"
3205                     "ld1    {v17.4s}, [%1]              \n"
3206 
3207                     "fmla   v17.4s, v20.4s, v2.4s       \n"
3208 
3209                     "st1    {v18.4s}, [%2], #16         \n"
3210                     "st1    {v19.4s}, [%3], #16         \n"
3211 
3212                     "prfm   pldl1keep, [%2, #128]       \n"
3213                     "ld1    {v18.4s}, [%2]              \n"
3214 
3215                     "fmla   v18.4s, v20.4s, v4.4s       \n"
3216 
3217                     "prfm   pldl1keep, [%3, #128]       \n"
3218                     "ld1    {v19.4s}, [%3]              \n"
3219 
3220                     "fmla   v19.4s, v20.4s, v6.4s       \n"
3221 
3222                     "prfm   pldl1keep, [%5, #128]       \n"
3223                     "ld1    {v21.4s}, [%5], #16         \n"
3224 
3225                     "fmla   v16.4s, v21.4s, v1.4s       \n"
3226                     "fmla   v17.4s, v21.4s, v3.4s       \n"
3227 
3228                     "prfm   pldl1keep, [%4, #128]       \n"
3229                     "ld1    {v20.4s}, [%4], #16         \n"
3230 
3231                     "fmla   v18.4s, v21.4s, v5.4s       \n"
3232                     "fmla   v19.4s, v21.4s, v7.4s       \n"
3233 
3234                     "st1    {v16.4s}, [%0], #16         \n"
3235                     "st1    {v17.4s}, [%1], #16         \n"
3236 
3237                     ////
3238 
3239                     "prfm   pldl1keep, [%0, #128]       \n"
3240                     "ld1    {v16.4s}, [%0]              \n"
3241 
3242                     "fmla   v16.4s, v20.4s, v0.4s       \n"
3243 
3244                     "prfm   pldl1keep, [%1, #128]       \n"
3245                     "ld1    {v17.4s}, [%1]              \n"
3246 
3247                     "fmla   v17.4s, v20.4s, v2.4s       \n"
3248 
3249                     "st1    {v18.4s}, [%2], #16         \n"
3250                     "st1    {v19.4s}, [%3], #16         \n"
3251 
3252                     "prfm   pldl1keep, [%2, #128]       \n"
3253                     "ld1    {v18.4s}, [%2]              \n"
3254 
3255                     "fmla   v18.4s, v20.4s, v4.4s       \n"
3256 
3257                     "prfm   pldl1keep, [%3, #128]       \n"
3258                     "ld1    {v19.4s}, [%3]              \n"
3259 
3260                     "fmla   v19.4s, v20.4s, v6.4s       \n"
3261 
3262                     "prfm   pldl1keep, [%5, #128]       \n"
3263                     "ld1    {v21.4s}, [%5], #16         \n"
3264 
3265                     "fmla   v16.4s, v21.4s, v1.4s       \n"
3266                     "fmla   v17.4s, v21.4s, v3.4s       \n"
3267 
3268                     "prfm   pldl1keep, [%4, #128]       \n"
3269                     "ld1    {v20.4s}, [%4], #16         \n"
3270 
3271                     "fmla   v18.4s, v21.4s, v5.4s       \n"
3272                     "fmla   v19.4s, v21.4s, v7.4s       \n"
3273 
3274                     "st1    {v16.4s}, [%0], #16         \n"
3275                     "st1    {v17.4s}, [%1], #16         \n"
3276 
3277                     ////
3278 
3279                     "prfm   pldl1keep, [%0, #128]       \n"
3280                     "ld1    {v16.4s}, [%0]              \n"
3281 
3282                     "fmla   v16.4s, v20.4s, v0.4s       \n"
3283 
3284                     "prfm   pldl1keep, [%1, #128]       \n"
3285                     "ld1    {v17.4s}, [%1]              \n"
3286 
3287                     "fmla   v17.4s, v20.4s, v2.4s       \n"
3288 
3289                     "st1    {v18.4s}, [%2], #16         \n"
3290                     "st1    {v19.4s}, [%3], #16         \n"
3291 
3292                     "prfm   pldl1keep, [%2, #128]       \n"
3293                     "ld1    {v18.4s}, [%2]              \n"
3294 
3295                     "fmla   v18.4s, v20.4s, v4.4s       \n"
3296 
3297                     "prfm   pldl1keep, [%3, #128]       \n"
3298                     "ld1    {v19.4s}, [%3]              \n"
3299 
3300                     "fmla   v19.4s, v20.4s, v6.4s       \n"
3301 
3302                     "prfm   pldl1keep, [%5, #128]       \n"
3303                     "ld1    {v21.4s}, [%5], #16         \n"
3304 
3305                     "fmla   v16.4s, v21.4s, v1.4s       \n"
3306                     "fmla   v17.4s, v21.4s, v3.4s       \n"
3307 
3308                     "prfm   pldl1keep, [%4, #128]       \n"
3309                     "ld1    {v20.4s}, [%4], #16         \n"
3310 
3311                     "fmla   v18.4s, v21.4s, v5.4s       \n"
3312                     "fmla   v19.4s, v21.4s, v7.4s       \n"
3313 
3314                     "st1    {v16.4s}, [%0], #16         \n"
3315                     "st1    {v17.4s}, [%1], #16         \n"
3316 
3317                     "subs   w1, w1, #1                  \n"
3318 
3319                     "st1    {v18.4s}, [%2], #16         \n"
3320                     "st1    {v19.4s}, [%3], #16         \n"
3321 
3322                     "bne    1b                          \n"
3323                     "sub    %4, %4, #16                 \n"
3324                     //END tile loop
3325 
3326                     "2:                                 \n"
3327 
3328                     // remain loop
3329                     "and    w1, %w14, #3                \n" // w1 = remain = tiles & 3;
3330                     "cmp    w1, #0                      \n"
3331                     "beq    4f                          \n"
3332 
3333                     //BEGIN remain loop
3334                     "3:                                 \n"
3335 
3336                     "prfm   pldl1keep, [%4, #128]       \n"
3337                     "ld1    {v20.4s}, [%4], #16         \n"
3338 
3339                     "prfm   pldl1keep, [%0, #128]       \n"
3340                     "ld1    {v16.4s}, [%0]              \n"
3341 
3342                     "fmla   v16.4s, v20.4s, v0.4s       \n"
3343 
3344                     "prfm   pldl1keep, [%1, #128]       \n"
3345                     "ld1    {v17.4s}, [%1]              \n"
3346 
3347                     "fmla   v17.4s, v20.4s, v2.4s       \n"
3348 
3349                     "prfm   pldl1keep, [%2, #128]       \n"
3350                     "ld1    {v18.4s}, [%2]              \n"
3351 
3352                     "fmla   v18.4s, v20.4s, v4.4s       \n"
3353 
3354                     "prfm   pldl1keep, [%3, #128]       \n"
3355                     "ld1    {v19.4s}, [%3]              \n"
3356 
3357                     "fmla   v19.4s, v20.4s, v6.4s       \n"
3358 
3359                     "prfm   pldl1keep, [%5, #128]       \n"
3360                     "ld1    {v21.4s}, [%5], #16         \n"
3361 
3362                     "fmla   v16.4s, v21.4s, v1.4s       \n"
3363                     "fmla   v17.4s, v21.4s, v3.4s       \n"
3364                     "fmla   v18.4s, v21.4s, v5.4s       \n"
3365                     "fmla   v19.4s, v21.4s, v7.4s       \n"
3366 
3367                     "st1    {v16.4s}, [%0], #16         \n"
3368                     "st1    {v17.4s}, [%1], #16         \n"
3369 
3370                     "subs   w1, w1, #1                  \n"
3371 
3372                     "st1    {v18.4s}, [%2], #16         \n"
3373                     "st1    {v19.4s}, [%3], #16         \n"
3374 
3375                     "bne    3b                          \n"
3376                     //END remain loop
3377 
3378                     "4:                                 \n"
3379 
3380                     "subs   w0, w0, #1                  \n"
3381                     "bne    0b                          \n"
3382 
3383                     : "=r"(output0_tm), // %0
3384                     "=r"(output1_tm), // %1
3385                     "=r"(output2_tm), // %2
3386                     "=r"(output3_tm), // %3
3387                     "=r"(r0),         // %4
3388                     "=r"(r1),         // %5
3389                     "=r"(ktm)         // %6
3390                     : "0"(output0_tm),
3391                     "1"(output1_tm),
3392                     "2"(output2_tm),
3393                     "3"(output3_tm),
3394                     "4"(r0),
3395                     "5"(r1),
3396                     "6"(ktm),
3397                     "r"(tiles) // %14
3398                     : "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21");
3399 #else
3400                 asm volatile(
3401                     "mov        r0, #16                 \n" // r0 = r = 16
3402                     "0:                                 \n"
3403 
3404                     "pld        [%6, #256]              \n"
3405                     "vld1.f32   {d0-d3}, [%6 :128]!     \n" // q0 q1 = _k00 _k01
3406 
3407                     "pld        [%6, #256]              \n"
3408                     "vld1.f32   {d4-d7}, [%6 :128]!     \n" // q2 q3 = _k10 _k11
3409 
3410                     "pld        [%6, #256]              \n"
3411                     "vld1.f32   {d8-d11}, [%6 :128]!    \n" // q4 q5 = _k20 _k21
3412 
3413                     "pld        [%6, #256]              \n"
3414                     "vld1.f32   {d12-d15}, [%6 :128]!   \n" // q6 q7 = _k30 _k31
3415 
3416                     // tile loop
3417                     "lsr        r1, %14, #2             \n" // r1 = nn = tiles >> 2
3418                     "cmp        r1, #0                  \n"
3419                     "beq        2f                      \n"
3420 
3421                     //BEGIN tile loop
3422                     "pld        [%4, #128]              \n"
3423                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3424 
3425                     "1:                                 \n"
3426 
3427                     "pld        [%0, #128]              \n"
3428                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3429 
3430                     "vmla.f32   q8, q12, q0             \n"
3431 
3432                     "pld        [%1, #128]              \n"
3433                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3434 
3435                     "vmla.f32   q9, q12, q2             \n"
3436 
3437                     "pld        [%2, #128]              \n"
3438                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3439 
3440                     "vmla.f32   q10, q12, q4            \n"
3441 
3442                     "pld        [%3, #128]              \n"
3443                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3444 
3445                     "vmla.f32   q11, q12, q6            \n"
3446 
3447                     "pld        [%5, #128]              \n"
3448                     "vld1.f32   {d26-d27}, [%5 :128]!   \n" // q13 = _r1
3449 
3450                     "vmla.f32   q8, q13, q1             \n"
3451                     "vmla.f32   q9, q13, q3             \n"
3452 
3453                     "pld        [%4, #128]              \n"
3454                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3455 
3456                     "vmla.f32   q10, q13, q5            \n"
3457                     "vmla.f32   q11, q13, q7            \n"
3458 
3459                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3460                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3461 
3462                     ////
3463 
3464                     "pld        [%0, #128]              \n"
3465                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3466 
3467                     "vmla.f32   q8, q12, q0             \n"
3468 
3469                     "pld        [%1, #128]              \n"
3470                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3471 
3472                     "vmla.f32   q9, q12, q2             \n"
3473 
3474                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3475                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3476 
3477                     "pld        [%2, #128]              \n"
3478                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3479 
3480                     "vmla.f32   q10, q12, q4            \n"
3481 
3482                     "pld        [%3, #128]              \n"
3483                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3484 
3485                     "vmla.f32   q11, q12, q6            \n"
3486 
3487                     "pld        [%5, #128]              \n"
3488                     "vld1.f32   {d26-d27}, [%5 :128]!   \n" // q13 = _r1
3489 
3490                     "vmla.f32   q8, q13, q1             \n"
3491                     "vmla.f32   q9, q13, q3             \n"
3492 
3493                     "pld        [%4, #128]              \n"
3494                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3495 
3496                     "vmla.f32   q10, q13, q5            \n"
3497                     "vmla.f32   q11, q13, q7            \n"
3498 
3499                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3500                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3501 
3502                     ////
3503 
3504                     "pld        [%0, #128]              \n"
3505                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3506 
3507                     "vmla.f32   q8, q12, q0             \n"
3508 
3509                     "pld        [%1, #128]              \n"
3510                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3511 
3512                     "vmla.f32   q9, q12, q2             \n"
3513 
3514                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3515                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3516 
3517                     "pld        [%2, #128]              \n"
3518                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3519 
3520                     "vmla.f32   q10, q12, q4            \n"
3521 
3522                     "pld        [%3, #128]              \n"
3523                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3524 
3525                     "vmla.f32   q11, q12, q6            \n"
3526 
3527                     "pld        [%5, #128]              \n"
3528                     "vld1.f32   {d26-d27}, [%5 :128]!   \n" // q13 = _r1
3529 
3530                     "vmla.f32   q8, q13, q1             \n"
3531                     "vmla.f32   q9, q13, q3             \n"
3532 
3533                     "pld        [%4, #128]              \n"
3534                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3535 
3536                     "vmla.f32   q10, q13, q5            \n"
3537                     "vmla.f32   q11, q13, q7            \n"
3538 
3539                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3540                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3541 
3542                     ////
3543 
3544                     "pld        [%0, #128]              \n"
3545                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3546 
3547                     "vmla.f32   q8, q12, q0             \n"
3548 
3549                     "pld        [%1, #128]              \n"
3550                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3551 
3552                     "vmla.f32   q9, q12, q2             \n"
3553 
3554                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3555                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3556 
3557                     "pld        [%2, #128]              \n"
3558                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3559 
3560                     "vmla.f32   q10, q12, q4            \n"
3561 
3562                     "pld        [%3, #128]              \n"
3563                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3564 
3565                     "vmla.f32   q11, q12, q6            \n"
3566 
3567                     "pld        [%5, #128]              \n"
3568                     "vld1.f32   {d26-d27}, [%5 :128]!   \n" // q13 = _r1
3569 
3570                     "vmla.f32   q8, q13, q1             \n"
3571                     "vmla.f32   q9, q13, q3             \n"
3572 
3573                     "pld        [%4, #128]              \n"
3574                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3575 
3576                     "vmla.f32   q10, q13, q5            \n"
3577                     "vmla.f32   q11, q13, q7            \n"
3578 
3579                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3580                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3581 
3582                     "subs       r1, #1                  \n"
3583 
3584                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3585                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3586 
3587                     "bne        1b                      \n"
3588                     "sub        %4, %4, #16             \n"
3589                     //END tile loop
3590 
3591                     "2:                                 \n"
3592 
3593                     // remain loop
3594                     "and        r1, %14, #3             \n" // r1 = remain = tiles & 3;
3595                     "cmp        r1, #0                  \n"
3596                     "beq        4f                      \n"
3597 
3598                     //BEGIN remain loop
3599                     "3:                                 \n"
3600 
3601                     "pld        [%4, #128]              \n"
3602                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3603 
3604                     "pld        [%0, #128]              \n"
3605                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3606 
3607                     "vmla.f32   q8, q12, q0             \n"
3608 
3609                     "pld        [%1, #128]              \n"
3610                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3611 
3612                     "vmla.f32   q9, q12, q2             \n"
3613 
3614                     "pld        [%2, #128]              \n"
3615                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3616 
3617                     "vmla.f32   q10, q12, q4            \n"
3618 
3619                     "pld        [%3, #128]              \n"
3620                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3621 
3622                     "vmla.f32   q11, q12, q6            \n"
3623 
3624                     "pld        [%5, #128]              \n"
3625                     "vld1.f32   {d26-d27}, [%5 :128]!   \n" // q13 = _r1
3626 
3627                     "vmla.f32   q8, q13, q1             \n"
3628                     "vmla.f32   q9, q13, q3             \n"
3629                     "vmla.f32   q10, q13, q5            \n"
3630                     "vmla.f32   q11, q13, q7            \n"
3631 
3632                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3633                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3634 
3635                     "subs       r1, #1                  \n"
3636 
3637                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3638                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3639 
3640                     "bne        3b                      \n"
3641                     //END remain loop
3642 
3643                     "4:                                 \n"
3644 
3645                     "subs       r0, #1                  \n"
3646                     "bne        0b                      \n"
3647 
3648                     : "=r"(output0_tm), // %0
3649                     "=r"(output1_tm), // %1
3650                     "=r"(output2_tm), // %2
3651                     "=r"(output3_tm), // %3
3652                     "=r"(r0),         // %4
3653                     "=r"(r1),         // %5
3654                     "=r"(ktm)         // %6
3655                     : "0"(output0_tm),
3656                     "1"(output1_tm),
3657                     "2"(output2_tm),
3658                     "3"(output3_tm),
3659                     "4"(r0),
3660                     "5"(r1),
3661                     "6"(ktm),
3662                     "r"(tiles) // %14
3663                     : "cc", "memory", "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13");
3664 #endif // __aarch64__
3665 #else
3666                 for (int r = 0; r < 16; r++)
3667                 {
3668                     for (int t = 0; t < tiles; t++)
3669                     {
3670                         for (int m = 0; m < 4; m++)
3671                         {
3672                             output0_tm[m] += r0[m] * ktm[0 + m];
3673                             output0_tm[m] += r1[m] * ktm[4 + m];
3674                             output1_tm[m] += r0[m] * ktm[8 + m];
3675                             output1_tm[m] += r1[m] * ktm[12 + m];
3676                             output2_tm[m] += r0[m] * ktm[16 + m];
3677                             output2_tm[m] += r1[m] * ktm[20 + m];
3678                             output3_tm[m] += r0[m] * ktm[24 + m];
3679                             output3_tm[m] += r1[m] * ktm[28 + m];
3680                         }
3681 
3682                         r0 += 4;
3683                         r1 += 4;
3684                         output0_tm += 4;
3685                         output1_tm += 4;
3686                         output2_tm += 4;
3687                         output3_tm += 4;
3688                     }
3689 
3690                     ktm += 32;
3691                 }
3692 #endif // __ARM_NEON
3693             }
3694 
3695             for (; q < inch; q++)
3696             {
3697                 const float* r0 = bottom_blob_tm.channel(q);
3698 
3699                 float* output0_tm = out0_tm;
3700                 float* output1_tm = out1_tm;
3701                 float* output2_tm = out2_tm;
3702                 float* output3_tm = out3_tm;
3703 
3704 #if __ARM_NEON
3705 #if __aarch64__
3706                 asm volatile(
3707                     "mov    w0, #16                     \n" // w0 = r = 16
3708                     "0:                                 \n"
3709 
3710                     "prfm   pldl1keep, [%5, #256]       \n"
3711                     "ld1    {v0.4s, v1.4s}, [%5], #32   \n" // v0 v1 = _k00 _k10
3712 
3713                     "prfm   pldl1keep, [%5, #256]       \n"
3714                     "ld1    {v2.4s, v3.4s}, [%5], #32   \n" // v2 v3 = _k20 _k30
3715 
3716                     // tile loop
3717                     "mov    w1, %w12                    \n" // w1 = tiles
3718                     "cmp    w1, #0                      \n"
3719                     "beq    2f                          \n"
3720 
3721                     //BEGIN tile loop
3722                     "1:                                 \n"
3723 
3724                     "prfm   pldl1keep, [%4, #128]       \n"
3725                     "ld1    {v16.4s}, [%4], #16         \n"
3726 
3727                     "prfm   pldl1keep, [%0, #128]       \n"
3728                     "ld1    {v17.4s}, [%0]              \n"
3729 
3730                     "fmla   v17.4s, v16.4s, v0.4s       \n"
3731 
3732                     "prfm   pldl1keep, [%1, #128]       \n"
3733                     "ld1    {v18.4s}, [%1]              \n"
3734 
3735                     "fmla   v18.4s, v16.4s, v1.4s       \n"
3736 
3737                     "prfm   pldl1keep, [%2, #128]       \n"
3738                     "ld1    {v19.4s}, [%2]              \n"
3739 
3740                     "fmla   v19.4s, v16.4s, v2.4s       \n"
3741 
3742                     "prfm   pldl1keep, [%3, #128]       \n"
3743                     "ld1    {v20.4s}, [%3]              \n"
3744 
3745                     "fmla   v20.4s, v16.4s, v3.4s       \n"
3746 
3747                     "st1    {v17.4s}, [%0], #16         \n"
3748                     "st1    {v18.4s}, [%1], #16         \n"
3749 
3750                     "subs   w1, w1, #1                  \n"
3751 
3752                     "st1    {v19.4s}, [%2], #16         \n"
3753                     "st1    {v20.4s}, [%3], #16         \n"
3754 
3755                     "bne    1b                          \n"
3756                     //END tile loop
3757 
3758                     "2:                                 \n"
3759 
3760                     "subs   w0, w0, #1                  \n"
3761                     "bne    0b                          \n"
3762 
3763                     : "=r"(output0_tm), // %0
3764                     "=r"(output1_tm), // %1
3765                     "=r"(output2_tm), // %2
3766                     "=r"(output3_tm), // %3
3767                     "=r"(r0),         // %4
3768                     "=r"(ktm)         // %5
3769                     : "0"(output0_tm),
3770                     "1"(output1_tm),
3771                     "2"(output2_tm),
3772                     "3"(output3_tm),
3773                     "4"(r0),
3774                     "5"(ktm),
3775                     "r"(tiles) // %12
3776                     : "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20");
3777 #else
3778                 asm volatile(
3779                     "mov        r0, #16                 \n" // r0 = r = 16
3780                     "0:                                 \n"
3781 
3782                     "pld        [%5, #256]              \n"
3783                     "vld1.f32   {d0-d3}, [%5 :128]!     \n" // q0 q1 = _k00 _k10
3784 
3785                     "pld        [%5, #256]              \n"
3786                     "vld1.f32   {d4-d7}, [%5 :128]!     \n" // q2 q3 = _k20 _k30
3787 
3788                     // tile loop
3789                     "mov        r1, %12                 \n" // r1 = tiles
3790                     "cmp        r1, #0                  \n"
3791                     "beq        2f                      \n"
3792 
3793                     //BEGIN tile loop
3794                     "1:                                 \n"
3795 
3796                     "pld        [%4, #128]              \n"
3797                     "vld1.f32   {d24-d25}, [%4 :128]!   \n" // q12 = _r0
3798 
3799                     "pld        [%0, #128]              \n"
3800                     "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3801 
3802                     "vmla.f32   q8, q12, q0             \n"
3803 
3804                     "pld        [%1, #128]              \n"
3805                     "vld1.f32   {d18-d19}, [%1 :128]    \n" // q9 = _output1_tm
3806 
3807                     "vmla.f32   q9, q12, q1             \n"
3808 
3809                     "pld        [%2, #128]              \n"
3810                     "vld1.f32   {d20-d21}, [%2 :128]    \n" // q10 = _output2_tm
3811 
3812                     "vmla.f32   q10, q12, q2            \n"
3813 
3814                     "pld        [%3, #128]              \n"
3815                     "vld1.f32   {d22-d23}, [%3 :128]    \n" // q11 = _output3_tm
3816 
3817                     "vmla.f32   q11, q12, q3            \n"
3818 
3819                     "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3820                     "vst1.f32   {d18-d19}, [%1 :128]!   \n"
3821 
3822                     "subs       r1, #1                  \n"
3823 
3824                     "vst1.f32   {d20-d21}, [%2 :128]!   \n"
3825                     "vst1.f32   {d22-d23}, [%3 :128]!   \n"
3826 
3827                     "bne        1b                      \n"
3828                     //END tile loop
3829 
3830                     "2:                                 \n"
3831 
3832                     "subs       r0, #1                  \n"
3833                     "bne        0b                      \n"
3834 
3835                     : "=r"(output0_tm), // %0
3836                     "=r"(output1_tm), // %1
3837                     "=r"(output2_tm), // %2
3838                     "=r"(output3_tm), // %3
3839                     "=r"(r0),         // %4
3840                     "=r"(ktm)         // %5
3841                     : "0"(output0_tm),
3842                     "1"(output1_tm),
3843                     "2"(output2_tm),
3844                     "3"(output3_tm),
3845                     "4"(r0),
3846                     "5"(ktm),
3847                     "r"(tiles) // %12
3848                     : "cc", "memory", "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13");
3849 #endif // __aarch64__
3850 #else
3851                 for (int r = 0; r < 16; r++)
3852                 {
3853                     for (int t = 0; t < tiles; t++)
3854                     {
3855                         for (int m = 0; m < 4; m++)
3856                         {
3857                             output0_tm[m] += r0[m] * ktm[0 + m];
3858                             output1_tm[m] += r0[m] * ktm[4 + m];
3859                             output2_tm[m] += r0[m] * ktm[8 + m];
3860                             output3_tm[m] += r0[m] * ktm[12 + m];
3861                         }
3862 
3863                         r0 += 4;
3864                         output0_tm += 4;
3865                         output1_tm += 4;
3866                         output2_tm += 4;
3867                         output3_tm += 4;
3868                     }
3869 
3870                     ktm += 16;
3871                 }
3872 #endif // __ARM_NEON
3873             }
3874         }
3875 
3876         #pragma omp parallel for num_threads(opt.num_threads)
3877         for (int p = remain_outch_start; p < outch; p++)
3878         {
3879             Mat out0_tm = top_blob_tm.channel(p);
3880 
3881             const float* ktm = (const float*)kernel_tm.channel(nn_outch) + 8 * 8 * inch * (p - remain_outch_start);
3882 
3883             out0_tm.fill(0.f);
3884 
3885             int q = 0;
3886 
3887             for (; q < inch; q++)
3888             {
3889                 const float* r0 = bottom_blob_tm.channel(q);
3890 
3891                 float* output0_tm = out0_tm;
3892 
3893                 for (int r = 0; r < 16; r++)
3894                 {
3895 #if __ARM_NEON
3896                     float32x4_t _k00 = vld1q_f32(ktm);
3897                     ktm += 4;
3898 #endif // __ARM_NEON
3899 
3900                     // tile
3901                     for (int i = 0; i < tiles; i++)
3902                     {
3903 #if __ARM_NEON
3904 #if __aarch64__
3905                         asm volatile(
3906                             "prfm   pldl1keep, [%1, #128]   \n"
3907                             "ld1    {v17.4s}, [%1], #16     \n"
3908 
3909                             "prfm   pldl1keep, [%0, #128]   \n"
3910                             "ld1    {v16.4s}, [%0]          \n"
3911 
3912                             "fmla   v16.4s, v17.4s, %4.4s   \n"
3913 
3914                             "st1    {v16.4s}, [%0], #16     \n"
3915                             : "=r"(output0_tm), // %0
3916                             "=r"(r0)          // %1
3917                             : "0"(output0_tm),
3918                             "1"(r0),
3919                             "w"(_k00) // %4
3920                             : "cc", "memory", "v16", "v17");
3921 #else
3922                         asm volatile(
3923                             "pld        [%1, #128]              \n"
3924                             "vld1.f32   {d18-d19}, [%1 :128]!   \n" // q9 = _r0
3925 
3926                             "pld        [%0, #128]              \n"
3927                             "vld1.f32   {d16-d17}, [%0 :128]    \n" // q8 = _output0_tm
3928 
3929                             "vmla.f32   q8, q9, %q4             \n"
3930 
3931                             "vst1.f32   {d16-d17}, [%0 :128]!   \n"
3932                             : "=r"(output0_tm), // %0
3933                             "=r"(r0)          // %1
3934                             : "0"(output0_tm),
3935                             "1"(r0),
3936                             "w"(_k00) // %4
3937                             : "cc", "memory", "q8", "q9");
3938 #endif // __aarch64__
3939 #else
3940                         for (int m = 0; m < 4; m++)
3941                         {
3942                             output0_tm[m] += r0[m] * ktm[m];
3943                         }
3944 
3945                         r0 += 4;
3946                         output0_tm += 4;
3947 #endif // __ARM_NEON
3948                     }
3949 
3950 #if !__ARM_NEON
3951                     ktm += 4;
3952 #endif // __ARM_NEON
3953                 }
3954             }
3955         }
3956     }
3957     bottom_blob_tm = Mat();
3958     // END dot
3959 
3960     // BEGIN transform output
3961     Mat top_blob_bordered;
3962     top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
3963     {
3964         //         const float otm[6][8] = {
3965         //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
3966         //             {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
3967         //             {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
3968         //             {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
3969         //             {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
3970         //             {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
3971         //         };
3972 
3973         // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
3974         // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
3975         // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
3976         // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
3977         // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
3978         // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
3979 
3980 #if __ARM_NEON
3981         const float coeff[4] = {4.f, 8.f, 16.f, 32.f};
3982         float32x4_t _coeff = vld1q_f32(coeff);
3983 #endif // __ARM_NEON
3984 
3985         int w_tm = outw / 6 * 8;
3986         int h_tm = outh / 6 * 8;
3987         const int tiles = w_tm / 8 * h_tm / 8;
3988 
3989         #pragma omp parallel for num_threads(opt.num_threads)
3990         for (int p = 0; p < outch; p++)
3991         {
3992             const Mat out0_tm = top_blob_tm.channel(p);
3993             Mat out0 = top_blob_bordered.channel(p);
3994 
3995             const float bias0 = bias ? bias[p] : 0.f;
3996 #if __ARM_NEON
3997             float32x2_t _bias0 = vdup_n_f32(bias0);
3998 #endif // __ARM_NEON
3999 
4000             float tmp[6][8];
4001 
4002             // tile
4003             for (int i = 0; i < outh / 6; i++)
4004             {
4005                 for (int j = 0; j < outw / 6; j++)
4006                 {
4007 #if __ARM_NEON
4008                     const float* output0_tm0_0 = out0_tm.row(i * w_tm / 8 + j);
4009                     const float* output0_tm0_4 = out0_tm.row(i * w_tm / 8 + j + tiles);
4010                     const float* output0_tm1_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 2);
4011                     const float* output0_tm1_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 3);
4012                     const float* output0_tm2_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 4);
4013                     const float* output0_tm2_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 5);
4014                     const float* output0_tm3_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 6);
4015                     const float* output0_tm3_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 7);
4016 
4017 #if __aarch64__
4018                     for (int m = 0; m + 3 < 8; m += 4)
4019                     {
4020                         float32x4_t _output0_tm0_0123 = vld1q_f32(output0_tm0_0);
4021                         float32x4_t _output0_tm0_4567 = vld1q_f32(output0_tm0_4);
4022                         float32x4_t _output0_tm1_0123 = vld1q_f32(output0_tm1_0);
4023                         float32x4_t _output0_tm1_4567 = vld1q_f32(output0_tm1_4);
4024                         float32x4_t _output0_tm2_0123 = vld1q_f32(output0_tm2_0);
4025                         float32x4_t _output0_tm2_4567 = vld1q_f32(output0_tm2_4);
4026                         float32x4_t _output0_tm3_0123 = vld1q_f32(output0_tm3_0);
4027                         float32x4_t _output0_tm3_4567 = vld1q_f32(output0_tm3_4);
4028 
4029                         float32x4x2_t _output0_tm01_00221133 = vtrnq_f32(_output0_tm0_0123, _output0_tm1_0123);
4030                         float32x4x2_t _output0_tm01_44665577 = vtrnq_f32(_output0_tm0_4567, _output0_tm1_4567);
4031                         float32x4x2_t _output0_tm23_00221133 = vtrnq_f32(_output0_tm2_0123, _output0_tm3_0123);
4032                         float32x4x2_t _output0_tm23_44665577 = vtrnq_f32(_output0_tm2_4567, _output0_tm3_4567);
4033 
4034                         // no vswp intrinsic  :(
4035                         float32x4_t _output0_tm_00 = vcombine_f32(vget_low_f32(_output0_tm01_00221133.val[0]), vget_low_f32(_output0_tm23_00221133.val[0]));
4036                         float32x4_t _output0_tm_11 = vcombine_f32(vget_low_f32(_output0_tm01_00221133.val[1]), vget_low_f32(_output0_tm23_00221133.val[1]));
4037                         float32x4_t _output0_tm_22 = vcombine_f32(vget_high_f32(_output0_tm01_00221133.val[0]), vget_high_f32(_output0_tm23_00221133.val[0]));
4038                         float32x4_t _output0_tm_33 = vcombine_f32(vget_high_f32(_output0_tm01_00221133.val[1]), vget_high_f32(_output0_tm23_00221133.val[1]));
4039                         float32x4_t _output0_tm_44 = vcombine_f32(vget_low_f32(_output0_tm01_44665577.val[0]), vget_low_f32(_output0_tm23_44665577.val[0]));
4040                         float32x4_t _output0_tm_55 = vcombine_f32(vget_low_f32(_output0_tm01_44665577.val[1]), vget_low_f32(_output0_tm23_44665577.val[1]));
4041                         float32x4_t _output0_tm_66 = vcombine_f32(vget_high_f32(_output0_tm01_44665577.val[0]), vget_high_f32(_output0_tm23_44665577.val[0]));
4042                         float32x4_t _output0_tm_77 = vcombine_f32(vget_high_f32(_output0_tm01_44665577.val[1]), vget_high_f32(_output0_tm23_44665577.val[1]));
4043 
4044                         float32x4_t _tmp024a = vaddq_f32(_output0_tm_11, _output0_tm_22);
4045                         float32x4_t _tmp135a = vsubq_f32(_output0_tm_11, _output0_tm_22);
4046 
4047                         float32x4_t _tmp024b = vaddq_f32(_output0_tm_33, _output0_tm_44);
4048                         float32x4_t _tmp135b = vsubq_f32(_output0_tm_33, _output0_tm_44);
4049 
4050                         float32x4_t _tmp024c = vaddq_f32(_output0_tm_55, _output0_tm_66);
4051                         float32x4_t _tmp135c = vsubq_f32(_output0_tm_55, _output0_tm_66);
4052 
4053                         float32x4_t _tmp0 = vaddq_f32(_output0_tm_00, _tmp024a);
4054                         _tmp0 = vmlaq_lane_f32(_tmp0, _tmp024c, vget_high_f32(_coeff), 1);
4055                         _tmp0 = vaddq_f32(_tmp0, _tmp024b);
4056 
4057                         float32x4_t _tmp2 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
4058                         _tmp2 = vmlaq_lane_f32(_tmp2, _tmp024c, vget_low_f32(_coeff), 1);
4059 
4060                         float32x4_t _tmp4 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
4061                         _tmp4 = vaddq_f32(_tmp4, _tmp024c);
4062                         _tmp4 = vaddq_f32(_tmp4, _tmp024c);
4063 
4064                         vst1q_f32(&tmp[0][m], _tmp0);
4065                         vst1q_f32(&tmp[2][m], _tmp2);
4066                         vst1q_f32(&tmp[4][m], _tmp4);
4067 
4068                         float32x4_t _tmp1 = vmlaq_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
4069                         _tmp1 = vaddq_f32(_tmp1, _tmp135b);
4070                         _tmp1 = vaddq_f32(_tmp1, _tmp135b);
4071 
4072                         float32x4_t _tmp3 = vmlaq_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
4073                         _tmp3 = vmlaq_lane_f32(_tmp3, _tmp135c, vget_low_f32(_coeff), 0);
4074 
4075                         float32x4_t _tmp5 = vaddq_f32(_output0_tm_77, _tmp135a);
4076                         _tmp5 = vmlaq_lane_f32(_tmp5, _tmp135b, vget_high_f32(_coeff), 1);
4077                         _tmp5 = vaddq_f32(_tmp5, _tmp135c);
4078 
4079                         vst1q_f32(&tmp[1][m], _tmp1);
4080                         vst1q_f32(&tmp[3][m], _tmp3);
4081                         vst1q_f32(&tmp[5][m], _tmp5);
4082 
4083                         output0_tm0_0 += out0_tm.w * tiles * 2 * 4;
4084                         output0_tm0_4 += out0_tm.w * tiles * 2 * 4;
4085                         output0_tm1_0 += out0_tm.w * tiles * 2 * 4;
4086                         output0_tm1_4 += out0_tm.w * tiles * 2 * 4;
4087                         output0_tm2_0 += out0_tm.w * tiles * 2 * 4;
4088                         output0_tm2_4 += out0_tm.w * tiles * 2 * 4;
4089                         output0_tm3_0 += out0_tm.w * tiles * 2 * 4;
4090                         output0_tm3_4 += out0_tm.w * tiles * 2 * 4;
4091                     }
4092 
4093                     const float* t0 = tmp[0];
4094                     const float* t1 = tmp[1];
4095 
4096                     float* output0 = out0.row(i * 6) + j * 6;
4097                     float* output1 = output0 + outw;
4098 
4099                     for (int m = 0; m + 1 < 6; m += 2)
4100                     {
4101                         float32x4_t _t0_0123 = vld1q_f32(t0);
4102                         float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
4103                         float32x4_t _t1_0123 = vld1q_f32(t1);
4104                         float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
4105 
4106                         float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
4107                         float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
4108 
4109                         float32x2_t _t_00 = vget_low_f32(_t01_00221133.val[0]);
4110                         float32x2_t _t_11 = vget_low_f32(_t01_00221133.val[1]);
4111                         float32x2_t _t_22 = vget_high_f32(_t01_00221133.val[0]);
4112                         float32x2_t _t_33 = vget_high_f32(_t01_00221133.val[1]);
4113                         float32x2_t _t_44 = vget_low_f32(_t01_44665577.val[0]);
4114                         float32x2_t _t_55 = vget_low_f32(_t01_44665577.val[1]);
4115                         float32x2_t _t_66 = vget_high_f32(_t01_44665577.val[0]);
4116                         float32x2_t _t_77 = vget_high_f32(_t01_44665577.val[1]);
4117 
4118                         float32x2_t _tmp024a = vadd_f32(_t_11, _t_22);
4119                         float32x2_t _tmp135a = vsub_f32(_t_11, _t_22);
4120 
4121                         float32x2_t _tmp024b = vadd_f32(_t_33, _t_44);
4122                         float32x2_t _tmp135b = vsub_f32(_t_33, _t_44);
4123 
4124                         float32x2_t _tmp024c = vadd_f32(_t_55, _t_66);
4125                         float32x2_t _tmp135c = vsub_f32(_t_55, _t_66);
4126 
4127                         float32x2_t _output_0 = vadd_f32(_t_00, _tmp024a);
4128                         _output_0 = vmla_lane_f32(_output_0, _tmp024c, vget_high_f32(_coeff), 1);
4129                         _output_0 = vadd_f32(_output_0, _tmp024b);
4130                         _output_0 = vadd_f32(_output_0, _bias0);
4131 
4132                         float32x2_t _output_2 = vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
4133                         _output_2 = vmla_lane_f32(_output_2, _tmp024c, vget_low_f32(_coeff), 1);
4134                         _output_2 = vadd_f32(_output_2, _bias0);
4135 
4136                         float32x2_t _output_4 = vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
4137                         _output_4 = vadd_f32(_output_4, _tmp024c);
4138                         _output_4 = vadd_f32(_output_4, _tmp024c);
4139                         _output_4 = vadd_f32(_output_4, _bias0);
4140 
4141                         output0[0] = vget_lane_f32(_output_0, 0);
4142                         output1[0] = vget_lane_f32(_output_0, 1);
4143                         output0[2] = vget_lane_f32(_output_2, 0);
4144                         output1[2] = vget_lane_f32(_output_2, 1);
4145                         output0[4] = vget_lane_f32(_output_4, 0);
4146                         output1[4] = vget_lane_f32(_output_4, 1);
4147 
4148                         float32x2_t _output_1 = vmla_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
4149                         _output_1 = vadd_f32(_output_1, _tmp135b);
4150                         _output_1 = vadd_f32(_output_1, _tmp135b);
4151                         _output_1 = vadd_f32(_output_1, _bias0);
4152 
4153                         float32x2_t _output_3 = vmla_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
4154                         _output_3 = vmla_lane_f32(_output_3, _tmp135c, vget_low_f32(_coeff), 0);
4155                         _output_3 = vadd_f32(_output_3, _bias0);
4156 
4157                         float32x2_t _output_5 = vadd_f32(_t_77, _tmp135a);
4158                         _output_5 = vmla_lane_f32(_output_5, _tmp135b, vget_high_f32(_coeff), 1);
4159                         _output_5 = vadd_f32(_output_5, _tmp135c);
4160                         _output_5 = vadd_f32(_output_5, _bias0);
4161 
4162                         output0[1] = vget_lane_f32(_output_1, 0);
4163                         output1[1] = vget_lane_f32(_output_1, 1);
4164                         output0[3] = vget_lane_f32(_output_3, 0);
4165                         output1[3] = vget_lane_f32(_output_3, 1);
4166                         output0[5] = vget_lane_f32(_output_5, 0);
4167                         output1[5] = vget_lane_f32(_output_5, 1);
4168 
4169                         t0 += 8 * 2;
4170                         t1 += 8 * 2;
4171                         output0 += outw * 2;
4172                         output1 += outw * 2;
4173                     }
4174 #else  // __aarch64__
4175                     float* t0 = tmp[0];
4176                     float* t1 = tmp[1];
4177 
4178                     int step = out0_tm.w * tiles * 2 * 4 * 4;
4179 
4180                     asm volatile(
4181 
4182                         // loop0
4183                         "vld1.f32   {d16-d17}, [%2], %21 \n"
4184                         "vld1.f32   {d18-d19}, [%3], %21 \n"
4185                         "vld1.f32   {d20-d21}, [%4], %21 \n"
4186                         "vld1.f32   {d22-d23}, [%5], %21 \n"
4187                         "vld1.f32   {d24-d25}, [%6], %21 \n"
4188                         "vld1.f32   {d26-d27}, [%7], %21 \n"
4189                         "vld1.f32   {d28-d29}, [%8], %21 \n"
4190                         "vld1.f32   {d30-d31}, [%9], %21 \n"
4191 
4192                         "vtrn.32    q8, q10             \n"
4193                         "vtrn.32    q9, q11             \n"
4194                         "vtrn.32    q12, q14            \n"
4195                         "vtrn.32    q13, q15            \n"
4196 
4197                         "vswp       d17, d24            \n"
4198                         "vswp       d19, d26            \n"
4199                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
4200                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
4201 
4202                         "vadd.f32   q2, q10, q12        \n"
4203                         "vsub.f32   q3, q10, q12        \n"
4204 
4205                         "vadd.f32   q4, q14, q9         \n"
4206                         "vsub.f32   q5, q14, q9         \n"
4207 
4208                         "vadd.f32   q6, q11, q13        \n"
4209                         "vsub.f32   q7, q11, q13        \n" // spare q9 q10 q11 q12 q13 q14
4210 
4211                         "vmov       q9, q3              \n"
4212                         "vadd.f32   q8, q8, q2          \n"
4213                         "vmla.f32   q9, q7, %f20[0]     \n"
4214                         "vmov       q12, q2             \n"
4215                         "vmov       q10, q2             \n"
4216                         "vmov       q11, q3             \n"
4217                         "vmla.f32   q12, q4, %f20[0]    \n"
4218                         "vadd.f32   q15, q15, q3        \n"
4219                         "vmla.f32   q8, q6, %f20[1]     \n"
4220                         "vadd.f32   q9, q9, q5          \n"
4221                         "vmla.f32   q10, q4, %e20[0]    \n"
4222                         "vmla.f32   q11, q5, %e20[1]    \n"
4223                         "vadd.f32   q12, q12, q6        \n"
4224                         "vmla.f32   q15, q5, %f20[1]    \n"
4225                         "vadd.f32   q8, q8, q4          \n"
4226                         "vadd.f32   q9, q9, q5          \n"
4227                         "vmla.f32   q10, q6, %e20[1]    \n"
4228                         "vmla.f32   q11, q7, %e20[0]    \n"
4229                         "vadd.f32   q12, q12, q6        \n"
4230                         "vadd.f32   q15, q15, q7        \n"
4231 
4232                         "vst1.f32   {d16-d17}, [%0]     \n"
4233                         "add        %0, %0, #64         \n"
4234 
4235                         "vst1.f32   {d18-d19}, [%1]     \n"
4236                         "add        %1, %1, #64         \n"
4237 
4238                         "vst1.f32   {d20-d21}, [%0]     \n"
4239                         "add        %0, %0, #64         \n"
4240 
4241                         "vst1.f32   {d22-d23}, [%1]     \n"
4242                         "add        %1, %1, #64         \n"
4243 
4244                         "vst1.f32   {d24-d25}, [%0]     \n"
4245                         "sub        %0, %0, #112        \n"
4246 
4247                         "vst1.f32   {d30-d31}, [%1]     \n"
4248                         "sub        %1, %1, #112        \n"
4249 
4250                         // loop1
4251                         "vld1.f32   {d16-d17}, [%2]     \n"
4252                         "vld1.f32   {d18-d19}, [%3]     \n"
4253                         "vld1.f32   {d20-d21}, [%4]     \n"
4254                         "vld1.f32   {d22-d23}, [%5]     \n"
4255                         "vld1.f32   {d24-d25}, [%6]     \n"
4256                         "vld1.f32   {d26-d27}, [%7]     \n"
4257                         "vld1.f32   {d28-d29}, [%8]     \n"
4258                         "vld1.f32   {d30-d31}, [%9]     \n"
4259 
4260                         "vtrn.32    q8, q10             \n"
4261                         "vtrn.32    q9, q11             \n"
4262                         "vtrn.32    q12, q14            \n"
4263                         "vtrn.32    q13, q15            \n"
4264 
4265                         "vswp       d17, d24            \n"
4266                         "vswp       d19, d26            \n"
4267                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
4268                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
4269 
4270                         "vadd.f32   q2, q10, q12        \n"
4271                         "vsub.f32   q3, q10, q12        \n"
4272 
4273                         "vadd.f32   q4, q14, q9         \n"
4274                         "vsub.f32   q5, q14, q9         \n"
4275 
4276                         "vadd.f32   q6, q11, q13        \n"
4277                         "vsub.f32   q7, q11, q13        \n" // spare q9 q10 q11 q12 q13 q14
4278 
4279                         "vmov       q9, q3              \n"
4280                         "vadd.f32   q8, q8, q2          \n"
4281                         "vmla.f32   q9, q7, %f20[0]     \n"
4282                         "vmov       q12, q2             \n"
4283                         "vmov       q10, q2             \n"
4284                         "vmov       q11, q3             \n"
4285                         "vmla.f32   q12, q4, %f20[0]    \n"
4286                         "vadd.f32   q15, q15, q3        \n"
4287                         "vmla.f32   q8, q6, %f20[1]     \n"
4288                         "vadd.f32   q9, q9, q5          \n"
4289                         "vmla.f32   q10, q4, %e20[0]    \n"
4290                         "vmla.f32   q11, q5, %e20[1]    \n"
4291                         "vadd.f32   q12, q12, q6        \n"
4292                         "vmla.f32   q15, q5, %f20[1]    \n"
4293                         "vadd.f32   q8, q8, q4          \n"
4294                         "vadd.f32   q9, q9, q5          \n"
4295                         "vmla.f32   q10, q6, %e20[1]    \n"
4296                         "vmla.f32   q11, q7, %e20[0]    \n"
4297                         "vadd.f32   q12, q12, q6        \n"
4298                         "vadd.f32   q15, q15, q7        \n"
4299 
4300                         "vst1.f32   {d16-d17}, [%0]     \n"
4301                         "add        %0, %0, #64         \n"
4302 
4303                         "vst1.f32   {d18-d19}, [%1]     \n"
4304                         "add        %1, %1, #64         \n"
4305 
4306                         "vst1.f32   {d20-d21}, [%0]     \n"
4307                         "add        %0, %0, #64         \n"
4308 
4309                         "vst1.f32   {d22-d23}, [%1]     \n"
4310                         "add        %1, %1, #64         \n"
4311 
4312                         "vst1.f32   {d24-d25}, [%0]     \n"
4313 
4314                         "vst1.f32   {d30-d31}, [%1]     \n"
4315 
4316                         : "=r"(t0),            // %0
4317                         "=r"(t1),            // %1
4318                         "=r"(output0_tm0_0), // %2
4319                         "=r"(output0_tm0_4), // %3
4320                         "=r"(output0_tm1_0), // %4
4321                         "=r"(output0_tm1_4), // %5
4322                         "=r"(output0_tm2_0), // %6
4323                         "=r"(output0_tm2_4), // %7
4324                         "=r"(output0_tm3_0), // %8
4325                         "=r"(output0_tm3_4)  // %9
4326                         : "0"(t0),
4327                         "1"(t1),
4328                         "2"(output0_tm0_0),
4329                         "3"(output0_tm0_4),
4330                         "4"(output0_tm1_0),
4331                         "5"(output0_tm1_4),
4332                         "6"(output0_tm2_0),
4333                         "7"(output0_tm2_4),
4334                         "8"(output0_tm3_0),
4335                         "9"(output0_tm3_4),
4336                         "w"(_coeff), // %20
4337                         "r"(step)    // %21
4338                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4339 
4340                     t0 = tmp[0];
4341                     t1 = tmp[1];
4342 
4343                     float* output0 = out0.row(i * 6) + j * 6;
4344                     float* output1 = output0 + outw;
4345 
4346                     int stepw = outw * 2 * 4;
4347 
4348                     asm volatile(
4349 
4350                         // loop0
4351                         "vld1.f32   {d16-d19}, [%2]     \n"
4352                         "vld1.f32   {d20-d23}, [%3]     \n"
4353 
4354                         "add        %2, %2, #64         \n"
4355                         "add        %3, %3, #64         \n"
4356 
4357                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
4358                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
4359 
4360                         "vadd.f32   d4, d20, d17        \n"
4361                         "vsub.f32   d5, d20, d17        \n"
4362 
4363                         "vadd.f32   d6, d21, d18        \n"
4364                         "vsub.f32   d7, d21, d18        \n"
4365 
4366                         "vadd.f32   d8, d22, d19        \n"
4367                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
4368 
4369                         "vmov       d20, d5             \n"
4370                         "vmov       d18, d4             \n"
4371 
4372                         "vadd.f32   d16, d16, d4        \n"
4373                         "vmla.f32   d20, d9, %f8[0]     \n"
4374                         "vmov       d17, d4             \n"
4375                         "vmov       d21, d5             \n"
4376                         "vmla.f32   d18, d6, %f8[0]     \n"
4377                         "vadd.f32   d22, d23, d5        \n"
4378 
4379                         "vmla.f32   d16, d8, %f8[1]     \n"
4380                         "vadd.f32   d20, d20, d7        \n"
4381                         "vmla.f32   d17, d6, %e8[0]     \n"
4382                         "vmla.f32   d21, d7, %e8[1]     \n"
4383                         "vadd.f32   d18, d18, d8        \n"
4384                         "vmla.f32   d22, d7, %f8[1]     \n"
4385 
4386                         "vadd.f32   d16, d16, d6        \n"
4387                         "vadd.f32   d20, d20, d7        \n"
4388                         "vmla.f32   d17, d8, %e8[1]     \n"
4389                         "vmla.f32   d21, d9, %e8[0]     \n"
4390                         "vadd.f32   d18, d18, d8        \n"
4391                         "vadd.f32   d22, d22, d9        \n"
4392 
4393                         "vadd.f32   d16, d16, %P9       \n" // _bias0
4394                         "vadd.f32   d20, d20, %P9       \n" // _bias0
4395                         "vadd.f32   d17, d17, %P9       \n" // _bias0
4396                         "vadd.f32   d21, d21, %P9       \n" // _bias0
4397                         "vadd.f32   d18, d18, %P9       \n" // _bias0
4398                         "vadd.f32   d22, d22, %P9       \n" // _bias0
4399 
4400                         "vtrn.f32   q8, q10             \n"
4401                         "vtrn.f32   d18, d22            \n"
4402 
4403                         "vst1.f32   {d16-d18}, [%0], %10 \n"
4404                         "vst1.f32   {d20-d22}, [%1], %10 \n"
4405 
4406                         // loop1
4407                         "vld1.f32   {d16-d19}, [%2]     \n"
4408                         "vld1.f32   {d20-d23}, [%3]     \n"
4409 
4410                         "add        %2, %2, #64         \n"
4411                         "add        %3, %3, #64         \n"
4412 
4413                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
4414                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
4415 
4416                         "vadd.f32   d4, d20, d17        \n"
4417                         "vsub.f32   d5, d20, d17        \n"
4418 
4419                         "vadd.f32   d6, d21, d18        \n"
4420                         "vsub.f32   d7, d21, d18        \n"
4421 
4422                         "vadd.f32   d8, d22, d19        \n"
4423                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
4424 
4425                         "vmov       d20, d5             \n"
4426                         "vmov       d18, d4             \n"
4427 
4428                         "vadd.f32   d16, d16, d4        \n"
4429                         "vmla.f32   d20, d9, %f8[0]     \n"
4430                         "vmov       d17, d4             \n"
4431                         "vmov       d21, d5             \n"
4432                         "vmla.f32   d18, d6, %f8[0]     \n"
4433                         "vadd.f32   d22, d23, d5        \n"
4434 
4435                         "vmla.f32   d16, d8, %f8[1]     \n"
4436                         "vadd.f32   d20, d20, d7        \n"
4437                         "vmla.f32   d17, d6, %e8[0]     \n"
4438                         "vmla.f32   d21, d7, %e8[1]     \n"
4439                         "vadd.f32   d18, d18, d8        \n"
4440                         "vmla.f32   d22, d7, %f8[1]     \n"
4441 
4442                         "vadd.f32   d16, d16, d6        \n"
4443                         "vadd.f32   d20, d20, d7        \n"
4444                         "vmla.f32   d17, d8, %e8[1]     \n"
4445                         "vmla.f32   d21, d9, %e8[0]     \n"
4446                         "vadd.f32   d18, d18, d8        \n"
4447                         "vadd.f32   d22, d22, d9        \n"
4448 
4449                         "vadd.f32   d16, d16, %P9       \n" // _bias0
4450                         "vadd.f32   d20, d20, %P9       \n" // _bias0
4451                         "vadd.f32   d17, d17, %P9       \n" // _bias0
4452                         "vadd.f32   d21, d21, %P9       \n" // _bias0
4453                         "vadd.f32   d18, d18, %P9       \n" // _bias0
4454                         "vadd.f32   d22, d22, %P9       \n" // _bias0
4455 
4456                         "vtrn.f32   q8, q10             \n"
4457                         "vtrn.f32   d18, d22            \n"
4458 
4459                         "vst1.f32   {d16-d18}, [%0], %10 \n"
4460                         "vst1.f32   {d20-d22}, [%1], %10 \n"
4461 
4462                         // loop2
4463                         "vld1.f32   {d16-d19}, [%2]     \n"
4464                         "vld1.f32   {d20-d23}, [%3]     \n"
4465 
4466                         "add        %2, %2, #64         \n"
4467                         "add        %3, %3, #64         \n"
4468 
4469                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
4470                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
4471 
4472                         "vadd.f32   d4, d20, d17        \n"
4473                         "vsub.f32   d5, d20, d17        \n"
4474 
4475                         "vadd.f32   d6, d21, d18        \n"
4476                         "vsub.f32   d7, d21, d18        \n"
4477 
4478                         "vadd.f32   d8, d22, d19        \n"
4479                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
4480 
4481                         "vmov       d20, d5             \n"
4482                         "vmov       d18, d4             \n"
4483 
4484                         "vadd.f32   d16, d16, d4        \n"
4485                         "vmla.f32   d20, d9, %f8[0]     \n"
4486                         "vmov       d17, d4             \n"
4487                         "vmov       d21, d5             \n"
4488                         "vmla.f32   d18, d6, %f8[0]     \n"
4489                         "vadd.f32   d22, d23, d5        \n"
4490 
4491                         "vmla.f32   d16, d8, %f8[1]     \n"
4492                         "vadd.f32   d20, d20, d7        \n"
4493                         "vmla.f32   d17, d6, %e8[0]     \n"
4494                         "vmla.f32   d21, d7, %e8[1]     \n"
4495                         "vadd.f32   d18, d18, d8        \n"
4496                         "vmla.f32   d22, d7, %f8[1]     \n"
4497 
4498                         "vadd.f32   d16, d16, d6        \n"
4499                         "vadd.f32   d20, d20, d7        \n"
4500                         "vmla.f32   d17, d8, %e8[1]     \n"
4501                         "vmla.f32   d21, d9, %e8[0]     \n"
4502                         "vadd.f32   d18, d18, d8        \n"
4503                         "vadd.f32   d22, d22, d9        \n"
4504 
4505                         "vadd.f32   d16, d16, %P9       \n" // _bias0
4506                         "vadd.f32   d20, d20, %P9       \n" // _bias0
4507                         "vadd.f32   d17, d17, %P9       \n" // _bias0
4508                         "vadd.f32   d21, d21, %P9       \n" // _bias0
4509                         "vadd.f32   d18, d18, %P9       \n" // _bias0
4510                         "vadd.f32   d22, d22, %P9       \n" // _bias0
4511 
4512                         "vtrn.f32   q8, q10             \n"
4513                         "vtrn.f32   d18, d22            \n"
4514 
4515                         "vst1.f32   {d16-d18}, [%0], %10 \n"
4516                         "vst1.f32   {d20-d22}, [%1], %10 \n"
4517 
4518                         : "=r"(output0), // %0
4519                         "=r"(output1), // %1
4520                         "=r"(t0),      // %2
4521                         "=r"(t1)       // %3
4522                         : "0"(output0),
4523                         "1"(output1),
4524                         "2"(t0),
4525                         "3"(t1),
4526                         "w"(_coeff), // %8
4527                         "w"(_bias0), // %9
4528                         "r"(stepw)   // %10
4529                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4530 #endif // __aarch64__
4531 #else
4532                     const float* output0_tm_0 = out0_tm.row(i * w_tm / 8 + j);
4533                     const float* output0_tm_4 = out0_tm.row(i * w_tm / 8 + j + tiles);
4534 
4535                     for (int m = 0; m < 8; m++)
4536                     {
4537                         float tmp024a = output0_tm_0[1] + output0_tm_0[2];
4538                         float tmp135a = output0_tm_0[1] - output0_tm_0[2];
4539 
4540                         float tmp024b = output0_tm_0[3] + output0_tm_4[0];
4541                         float tmp135b = output0_tm_0[3] - output0_tm_4[0];
4542 
4543                         float tmp024c = output0_tm_4[1] + output0_tm_4[2];
4544                         float tmp135c = output0_tm_4[1] - output0_tm_4[2];
4545 
4546                         tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
4547                         tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
4548                         tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
4549 
4550                         tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
4551                         tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
4552                         tmp[5][m] = output0_tm_4[3] + tmp135a + tmp135b * 32 + tmp135c;
4553 
4554                         output0_tm_0 += out0_tm.w * tiles * 2;
4555                         output0_tm_4 += out0_tm.w * tiles * 2;
4556                     }
4557 
4558                     float* output0 = out0.row(i * 6) + j * 6;
4559 
4560                     for (int m = 0; m < 6; m++)
4561                     {
4562                         const float* tmp0 = tmp[m];
4563 
4564                         float tmp024a = tmp0[1] + tmp0[2];
4565                         float tmp135a = tmp0[1] - tmp0[2];
4566 
4567                         float tmp024b = tmp0[3] + tmp0[4];
4568                         float tmp135b = tmp0[3] - tmp0[4];
4569 
4570                         float tmp024c = tmp0[5] + tmp0[6];
4571                         float tmp135c = tmp0[5] - tmp0[6];
4572 
4573                         output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
4574                         output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
4575                         output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
4576 
4577                         output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
4578                         output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
4579                         output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
4580 
4581                         output0 += outw;
4582                     }
4583 #endif // __ARM_NEON
4584                 }
4585             }
4586         }
4587     }
4588     // END transform output
4589 
4590     // cut result pad
4591     copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
4592 }
4593 
conv3x3s1_winograd64_neon5(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Mat & _bias,const Option & opt)4594 static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
4595 {
4596     int w = bottom_blob.w;
4597     int h = bottom_blob.h;
4598     int inch = bottom_blob.c;
4599 
4600     int outw = top_blob.w;
4601     int outh = top_blob.h;
4602     int outch = top_blob.c;
4603 
4604     // pad to 6n+2
4605     Mat bottom_blob_bordered = bottom_blob;
4606 
4607     outw = (outw + 5) / 6 * 6;
4608     outh = (outh + 5) / 6 * 6;
4609 
4610     w = outw + 2;
4611     h = outh + 2;
4612     Option opt_b = opt;
4613     opt_b.blob_allocator = opt.workspace_allocator;
4614     copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
4615 
4616     const float* bias = _bias;
4617 
4618     // BEGIN transform input
4619     Mat bottom_blob_tm;
4620     {
4621         int w_tm = outw / 6 * 8;
4622         int h_tm = outh / 6 * 8;
4623         const int tiles = w_tm / 8 * h_tm / 8;
4624         bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator);
4625         //         bottom_blob_tm.create(inch, tiles, 64);
4626 
4627         //         const float itm[8][8] = {
4628         //             {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
4629         //
4630         //             {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
4631         //             {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
4632         //
4633         //             {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
4634         //             {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
4635         //
4636         //             {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
4637         //             {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
4638         //
4639         //             {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
4640         //         };
4641 
4642         // 0 = r00 - r06 + (r04 - r02) * 5.25
4643         // 7 = r07 - r01 + (r03 - r05) * 5.25
4644 
4645         // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
4646         // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
4647 
4648         // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
4649         // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
4650 
4651         // reuse r04 * 1.25
4652         // reuse r03 * 2.5
4653         // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
4654         // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
4655 
4656 #if __ARM_NEON
4657         const float coeff[8] = {
4658             0.25f, 0.5f, -1.25f, 2.f,
4659             -2.5f, 4.f, 4.25f, 5.25f
4660         };
4661         float32x4_t _coeff0 = vld1q_f32(coeff);
4662         float32x4_t _coeff1 = vld1q_f32(coeff + 4);
4663 #endif // __ARM_NEON
4664 
4665         #pragma omp parallel for num_threads(opt.num_threads)
4666         for (int q = 0; q < inch; q++)
4667         {
4668             const Mat img0 = bottom_blob_bordered.channel(q);
4669             Mat img0_tm = bottom_blob_tm.channel(q);
4670 
4671             float tmp[8][8];
4672 
4673             // tile
4674             for (int i = 0; i < h_tm / 8; i++)
4675             {
4676                 for (int j = 0; j < w_tm / 8; j++)
4677                 {
4678 #if __ARM_NEON
4679                     const float* r0 = img0.row(i * 6) + j * 6;
4680                     const float* r1 = r0 + w;
4681                     const float* r2 = r0 + w * 2;
4682                     const float* r3 = r0 + w * 3;
4683 
4684                     // the assembly block for armv7 input transform requires 13 general registers
4685                     // old gcc may fail to allocate register on debug build without -fomit-frame-pointer
4686                     // so, fallback to intrinsic version for armv7 debug build     --- nihui
4687 #if __aarch64__ || !defined(NDEBUG)
4688                     for (int m = 0; m + 3 < 8; m += 4)
4689                     {
4690                         float32x4_t _r0_0123 = vld1q_f32(r0);
4691                         float32x4_t _r0_4567 = vld1q_f32(r0 + 4);
4692                         float32x4_t _r1_0123 = vld1q_f32(r1);
4693                         float32x4_t _r1_4567 = vld1q_f32(r1 + 4);
4694                         float32x4_t _r2_0123 = vld1q_f32(r2);
4695                         float32x4_t _r2_4567 = vld1q_f32(r2 + 4);
4696                         float32x4_t _r3_0123 = vld1q_f32(r3);
4697                         float32x4_t _r3_4567 = vld1q_f32(r3 + 4);
4698 
4699                         float32x4x2_t _r01_00221133 = vtrnq_f32(_r0_0123, _r1_0123);
4700                         float32x4x2_t _r01_44665577 = vtrnq_f32(_r0_4567, _r1_4567);
4701                         float32x4x2_t _r23_00221133 = vtrnq_f32(_r2_0123, _r3_0123);
4702                         float32x4x2_t _r23_44665577 = vtrnq_f32(_r2_4567, _r3_4567);
4703 
4704                         // no vswp intrinsic  :(
4705                         float32x4_t _r_00 = vcombine_f32(vget_low_f32(_r01_00221133.val[0]), vget_low_f32(_r23_00221133.val[0]));
4706                         float32x4_t _r_11 = vcombine_f32(vget_low_f32(_r01_00221133.val[1]), vget_low_f32(_r23_00221133.val[1]));
4707                         float32x4_t _r_22 = vcombine_f32(vget_high_f32(_r01_00221133.val[0]), vget_high_f32(_r23_00221133.val[0]));
4708                         float32x4_t _r_33 = vcombine_f32(vget_high_f32(_r01_00221133.val[1]), vget_high_f32(_r23_00221133.val[1]));
4709                         float32x4_t _r_44 = vcombine_f32(vget_low_f32(_r01_44665577.val[0]), vget_low_f32(_r23_44665577.val[0]));
4710                         float32x4_t _r_55 = vcombine_f32(vget_low_f32(_r01_44665577.val[1]), vget_low_f32(_r23_44665577.val[1]));
4711                         float32x4_t _r_66 = vcombine_f32(vget_high_f32(_r01_44665577.val[0]), vget_high_f32(_r23_44665577.val[0]));
4712                         float32x4_t _r_77 = vcombine_f32(vget_high_f32(_r01_44665577.val[1]), vget_high_f32(_r23_44665577.val[1]));
4713 
4714                         float32x4_t _r_0_m_6 = vsubq_f32(_r_00, _r_66);
4715                         float32x4_t _r_7_m_1 = vsubq_f32(_r_77, _r_11);
4716 
4717                         float32x4_t _r_4_m_2 = vsubq_f32(_r_44, _r_22);
4718                         float32x4_t _r_3_m_5 = vsubq_f32(_r_33, _r_55);
4719 
4720                         float32x4_t _tmp0 = vmlaq_lane_f32(_r_0_m_6, _r_4_m_2, vget_high_f32(_coeff1), 1);
4721                         float32x4_t _tmp7 = vmlaq_lane_f32(_r_7_m_1, _r_3_m_5, vget_high_f32(_coeff1), 1);
4722 
4723                         vst1q_f32(&tmp[0][m], _tmp0);
4724                         vst1q_f32(&tmp[7][m], _tmp7);
4725 
4726                         float32x4_t _r_2_a_6 = vaddq_f32(_r_22, _r_66);
4727                         float32x4_t _r_1_a_5 = vaddq_f32(_r_11, _r_55);
4728 
4729                         float32x4_t _tmp12a = vmlsq_lane_f32(_r_2_a_6, _r_44, vget_high_f32(_coeff1), 0);
4730                         float32x4_t _tmp12b = vmlsq_lane_f32(_r_1_a_5, _r_33, vget_high_f32(_coeff1), 0);
4731 
4732                         float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
4733                         float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
4734 
4735                         vst1q_f32(&tmp[1][m], _tmp1);
4736                         vst1q_f32(&tmp[2][m], _tmp2);
4737 
4738                         float32x4_t _r_4_x_c = vmulq_lane_f32(_r_44, vget_high_f32(_coeff0), 0);
4739                         float32x4_t _r_3_x_c = vmulq_lane_f32(_r_33, vget_low_f32(_coeff1), 0);
4740 
4741                         float32x4_t _tmp34a = vaddq_f32(_r_66, _r_4_x_c);
4742                         _tmp34a = vmlaq_lane_f32(_tmp34a, _r_22, vget_low_f32(_coeff0), 0);
4743 
4744                         float32x4_t _tmp34b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_low_f32(_coeff0), 1);
4745                         _tmp34b = vmlaq_lane_f32(_tmp34b, _r_55, vget_high_f32(_coeff0), 1);
4746 
4747                         float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
4748                         float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
4749 
4750                         vst1q_f32(&tmp[3][m], _tmp3);
4751                         vst1q_f32(&tmp[4][m], _tmp4);
4752 
4753                         // reuse r04 * 1.25
4754                         // reuse r03 * 2.5
4755                         float32x4_t _r_2_a_4c = vaddq_f32(_r_22, _r_4_x_c);
4756                         float32x4_t _tmp56a = vmlaq_lane_f32(_r_66, _r_2_a_4c, vget_low_f32(_coeff1), 1);
4757                         float32x4_t _tmp56b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_high_f32(_coeff0), 1);
4758                         _tmp56b = vmlaq_lane_f32(_tmp56b, _r_55, vget_low_f32(_coeff0), 1);
4759 
4760                         float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
4761                         float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
4762 
4763                         vst1q_f32(&tmp[5][m], _tmp5);
4764                         vst1q_f32(&tmp[6][m], _tmp6);
4765 
4766                         r0 += w * 4;
4767                         r1 += w * 4;
4768                         r2 += w * 4;
4769                         r3 += w * 4;
4770                     }
4771 
4772                     const float* t0 = tmp[0];
4773                     const float* t1 = tmp[1];
4774                     const float* t2 = tmp[2];
4775                     const float* t3 = tmp[3];
4776 
4777                     float* r0_tm0 = img0_tm.row(i * w_tm / 8 + j);
4778                     float* r0_tm1 = img0_tm.row(i * w_tm / 8 + j + tiles * 8);
4779                     float* r0_tm2 = img0_tm.row(i * w_tm / 8 + j + tiles * 16);
4780                     float* r0_tm3 = img0_tm.row(i * w_tm / 8 + j + tiles * 24);
4781 
4782                     for (int m = 0; m + 3 < 8; m += 4)
4783                     {
4784                         float32x4_t _t0_0123 = vld1q_f32(t0);
4785                         float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
4786                         float32x4_t _t1_0123 = vld1q_f32(t1);
4787                         float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
4788                         float32x4_t _t2_0123 = vld1q_f32(t2);
4789                         float32x4_t _t2_4567 = vld1q_f32(t2 + 4);
4790                         float32x4_t _t3_0123 = vld1q_f32(t3);
4791                         float32x4_t _t3_4567 = vld1q_f32(t3 + 4);
4792 
4793                         float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
4794                         float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
4795                         float32x4x2_t _t23_00221133 = vtrnq_f32(_t2_0123, _t3_0123);
4796                         float32x4x2_t _t23_44665577 = vtrnq_f32(_t2_4567, _t3_4567);
4797 
4798                         // no vswp intrinsic  :(
4799                         float32x4_t _t_00 = vcombine_f32(vget_low_f32(_t01_00221133.val[0]), vget_low_f32(_t23_00221133.val[0]));
4800                         float32x4_t _t_11 = vcombine_f32(vget_low_f32(_t01_00221133.val[1]), vget_low_f32(_t23_00221133.val[1]));
4801                         float32x4_t _t_22 = vcombine_f32(vget_high_f32(_t01_00221133.val[0]), vget_high_f32(_t23_00221133.val[0]));
4802                         float32x4_t _t_33 = vcombine_f32(vget_high_f32(_t01_00221133.val[1]), vget_high_f32(_t23_00221133.val[1]));
4803                         float32x4_t _t_44 = vcombine_f32(vget_low_f32(_t01_44665577.val[0]), vget_low_f32(_t23_44665577.val[0]));
4804                         float32x4_t _t_55 = vcombine_f32(vget_low_f32(_t01_44665577.val[1]), vget_low_f32(_t23_44665577.val[1]));
4805                         float32x4_t _t_66 = vcombine_f32(vget_high_f32(_t01_44665577.val[0]), vget_high_f32(_t23_44665577.val[0]));
4806                         float32x4_t _t_77 = vcombine_f32(vget_high_f32(_t01_44665577.val[1]), vget_high_f32(_t23_44665577.val[1]));
4807 
4808                         float32x4_t _t_0_m_6 = vsubq_f32(_t_00, _t_66);
4809                         float32x4_t _t_7_m_1 = vsubq_f32(_t_77, _t_11);
4810 
4811                         float32x4_t _t_4_m_2 = vsubq_f32(_t_44, _t_22);
4812                         float32x4_t _t_3_m_5 = vsubq_f32(_t_33, _t_55);
4813 
4814                         float32x4_t _r0_tm_0_0 = vmlaq_lane_f32(_t_0_m_6, _t_4_m_2, vget_high_f32(_coeff1), 1);
4815                         float32x4_t _r0_tm_4_3 = vmlaq_lane_f32(_t_7_m_1, _t_3_m_5, vget_high_f32(_coeff1), 1);
4816 
4817                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_0, 0);
4818                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_0, 1);
4819                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_0, 2);
4820                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_0, 3);
4821 
4822                         r0_tm0 += img0_tm.w * tiles;
4823                         r0_tm1 += img0_tm.w * tiles;
4824                         r0_tm2 += img0_tm.w * tiles;
4825                         r0_tm3 += img0_tm.w * tiles;
4826 
4827                         float32x4_t _t_2_m_6 = vaddq_f32(_t_22, _t_66);
4828                         float32x4_t _t_1_m_5 = vaddq_f32(_t_11, _t_55);
4829 
4830                         float32x4_t _tmp12a = vmlsq_lane_f32(_t_2_m_6, _t_44, vget_high_f32(_coeff1), 0);
4831                         float32x4_t _tmp12b = vmlsq_lane_f32(_t_1_m_5, _t_33, vget_high_f32(_coeff1), 0);
4832 
4833                         float32x4_t _r0_tm_0_1 = vaddq_f32(_tmp12a, _tmp12b);
4834                         float32x4_t _r0_tm_0_2 = vsubq_f32(_tmp12a, _tmp12b);
4835 
4836                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_1, 0);
4837                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_1, 1);
4838                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_1, 2);
4839                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_1, 3);
4840 
4841                         r0_tm0 += img0_tm.w * tiles;
4842                         r0_tm1 += img0_tm.w * tiles;
4843                         r0_tm2 += img0_tm.w * tiles;
4844                         r0_tm3 += img0_tm.w * tiles;
4845 
4846                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_2, 0);
4847                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_2, 1);
4848                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_2, 2);
4849                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_2, 3);
4850 
4851                         r0_tm0 += img0_tm.w * tiles;
4852                         r0_tm1 += img0_tm.w * tiles;
4853                         r0_tm2 += img0_tm.w * tiles;
4854                         r0_tm3 += img0_tm.w * tiles;
4855 
4856                         float32x4_t _t_4_x_c = vmulq_lane_f32(_t_44, vget_high_f32(_coeff0), 0);
4857                         float32x4_t _t_3_x_c = vmulq_lane_f32(_t_33, vget_low_f32(_coeff1), 0);
4858 
4859                         float32x4_t _tmp34a = vaddq_f32(_t_66, _t_4_x_c);
4860                         _tmp34a = vmlaq_lane_f32(_tmp34a, _t_22, vget_low_f32(_coeff0), 0);
4861 
4862                         float32x4_t _tmp34b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_low_f32(_coeff0), 1);
4863                         _tmp34b = vmlaq_lane_f32(_tmp34b, _t_55, vget_high_f32(_coeff0), 1);
4864 
4865                         float32x4_t _r0_tm_0_3 = vaddq_f32(_tmp34a, _tmp34b);
4866                         float32x4_t _r0_tm_4_0 = vsubq_f32(_tmp34a, _tmp34b);
4867 
4868                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_3, 0);
4869                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_3, 1);
4870                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_3, 2);
4871                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_3, 3);
4872 
4873                         r0_tm0 += img0_tm.w * tiles;
4874                         r0_tm1 += img0_tm.w * tiles;
4875                         r0_tm2 += img0_tm.w * tiles;
4876                         r0_tm3 += img0_tm.w * tiles;
4877 
4878                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_0, 0);
4879                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_0, 1);
4880                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_0, 2);
4881                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_0, 3);
4882 
4883                         r0_tm0 += img0_tm.w * tiles;
4884                         r0_tm1 += img0_tm.w * tiles;
4885                         r0_tm2 += img0_tm.w * tiles;
4886                         r0_tm3 += img0_tm.w * tiles;
4887 
4888                         float32x4_t _t_2_a_4c = vaddq_f32(_t_22, _t_4_x_c);
4889                         float32x4_t _tmp56a = vmlaq_lane_f32(_t_66, _t_2_a_4c, vget_low_f32(_coeff1), 1);
4890                         float32x4_t _tmp56b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_high_f32(_coeff0), 1);
4891                         _tmp56b = vmlaq_lane_f32(_tmp56b, _t_55, vget_low_f32(_coeff0), 1);
4892 
4893                         float32x4_t _r0_tm_4_1 = vaddq_f32(_tmp56a, _tmp56b);
4894                         float32x4_t _r0_tm_4_2 = vsubq_f32(_tmp56a, _tmp56b);
4895 
4896                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_1, 0);
4897                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_1, 1);
4898                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_1, 2);
4899                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_1, 3);
4900 
4901                         r0_tm0 += img0_tm.w * tiles;
4902                         r0_tm1 += img0_tm.w * tiles;
4903                         r0_tm2 += img0_tm.w * tiles;
4904                         r0_tm3 += img0_tm.w * tiles;
4905 
4906                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_2, 0);
4907                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_2, 1);
4908                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_2, 2);
4909                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_2, 3);
4910 
4911                         r0_tm0 += img0_tm.w * tiles;
4912                         r0_tm1 += img0_tm.w * tiles;
4913                         r0_tm2 += img0_tm.w * tiles;
4914                         r0_tm3 += img0_tm.w * tiles;
4915 
4916                         r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_3, 0);
4917                         r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_3, 1);
4918                         r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_3, 2);
4919                         r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_3, 3);
4920 
4921                         t0 += 8 * 4;
4922                         t1 += 8 * 4;
4923                         t2 += 8 * 4;
4924                         t3 += 8 * 4;
4925 
4926                         r0_tm0 += img0_tm.w * tiles * 25;
4927                         r0_tm1 += img0_tm.w * tiles * 25;
4928                         r0_tm2 += img0_tm.w * tiles * 25;
4929                         r0_tm3 += img0_tm.w * tiles * 25;
4930                     }
4931 #else  // __aarch64__
4932                     float* t0 = tmp[0];
4933                     float* t1 = tmp[1];
4934                     float* t2 = tmp[2];
4935                     float* t3 = tmp[3];
4936                     float* t4 = tmp[4];
4937                     float* t5 = tmp[5];
4938                     float* t6 = tmp[6];
4939                     float* t7 = tmp[7];
4940 
4941                     int stepw = w * 4 * 4;
4942 
4943                     asm volatile(
4944 
4945                         // loop0
4946                         "vld1.f32   {d16-d19}, [%8], %26    \n"
4947                         "vld1.f32   {d20-d23}, [%9], %26    \n"
4948                         "vld1.f32   {d24-d27}, [%10], %26   \n"
4949 
4950                         "vtrn.32    q8, q10             \n"
4951 
4952                         "vld1.f32   {d28-d31}, [%11], %26   \n"
4953 
4954                         "vtrn.32    q9, q11             \n"
4955                         "vtrn.32    q12, q14            \n"
4956                         "vtrn.32    q13, q15            \n"
4957 
4958                         "vswp       d17, d24            \n"
4959                         "vswp       d19, d26            \n"
4960                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
4961                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
4962 
4963                         "vsub.f32   q2, q8, q13         \n"
4964                         "vsub.f32   q3, q9, q12         \n"
4965 
4966                         "vadd.f32   q4, q12, q13        \n"
4967                         "vadd.f32   q5, q10, q11        \n"
4968 
4969                         "vmla.f32   q2, q3, %f25[1]     \n"
4970 
4971                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
4972                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
4973 
4974                         "vmls.f32   q4, q9, %f25[0]     \n"
4975                         "vmls.f32   q5, q14, %f25[0]    \n"
4976 
4977                         "vst1.f32   {d4-d5}, [%0]!      \n" // tmp[0][m]
4978 
4979                         "vmov       q3, q7              \n" // use q7
4980 
4981                         "vadd.f32   q2, q13, q6         \n" // use q6
4982                         "vmla.f32   q3, q10, %e24[1]    \n"
4983 
4984                         "vadd.f32   q8, q4, q5          \n"
4985                         "vsub.f32   q9, q4, q5          \n"
4986 
4987                         "vmov       q5, q7              \n" // use q7
4988 
4989                         "vadd.f32   q6, q12, q6         \n" // use q6
4990                         "vmla.f32   q5, q10, %f24[1]    \n"
4991 
4992                         "vmov       q4, q13             \n"
4993 
4994                         "vmla.f32   q2, q12, %e24[0]    \n"
4995                         "vmla.f32   q3, q11, %f24[1]    \n"
4996 
4997                         "vst1.f32   {d16-d17}, [%1]!    \n" // tmp[1][m]
4998 
4999                         "vmla.f32   q4, q6, %e25[1]     \n"
5000                         "vmla.f32   q5, q11, %e24[1]    \n"
5001 
5002                         "vst1.f32   {d18-d19}, [%2]!    \n" // tmp[2][m]
5003 
5004                         "vadd.f32   q8, q2, q3          \n"
5005                         "vsub.f32   q9, q2, q3          \n"
5006 
5007                         "vsub.f32   q6, q15, q10        \n"
5008                         "vsub.f32   q7, q14, q11        \n"
5009 
5010                         "vadd.f32   q2, q4, q5          \n"
5011                         "vsub.f32   q3, q4, q5          \n"
5012 
5013                         "vst1.f32   {d16-d17}, [%3]!    \n" // tmp[3][m]
5014                         "vst1.f32   {d18-d19}, [%4]!    \n" // tmp[4][m]
5015 
5016                         "vmla.f32   q6, q7, %f25[1]     \n"
5017 
5018                         "vst1.f32   {d4-d5}, [%5]!      \n" // tmp[5][m]
5019                         "vst1.f32   {d6-d7}, [%6]!      \n" // tmp[6][m]
5020 
5021                         "vst1.f32   {d12-d13}, [%7]!    \n" // tmp[7][m]
5022 
5023                         // loop1
5024                         "vld1.f32   {d16-d19}, [%8]     \n"
5025                         "vld1.f32   {d20-d23}, [%9]     \n"
5026                         "vld1.f32   {d24-d27}, [%10]    \n"
5027 
5028                         "vtrn.32    q8, q10             \n"
5029 
5030                         "vld1.f32   {d28-d31}, [%11]    \n"
5031 
5032                         "vtrn.32    q9, q11             \n"
5033                         "vtrn.32    q12, q14            \n"
5034                         "vtrn.32    q13, q15            \n"
5035 
5036                         "vswp       d17, d24            \n"
5037                         "vswp       d19, d26            \n"
5038                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
5039                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
5040 
5041                         "vsub.f32   q2, q8, q13         \n"
5042                         "vsub.f32   q3, q9, q12         \n"
5043 
5044                         "vadd.f32   q4, q12, q13        \n"
5045                         "vadd.f32   q5, q10, q11        \n"
5046 
5047                         "vmla.f32   q2, q3, %f25[1]     \n"
5048 
5049                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
5050                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
5051 
5052                         "vmls.f32   q4, q9, %f25[0]     \n"
5053                         "vmls.f32   q5, q14, %f25[0]    \n"
5054 
5055                         "vst1.f32   {d4-d5}, [%0]!      \n" // tmp[0][m]
5056 
5057                         "vmov       q3, q7              \n" // use q7
5058 
5059                         "vadd.f32   q2, q13, q6         \n" // use q6
5060                         "vmla.f32   q3, q10, %e24[1]    \n"
5061 
5062                         "vadd.f32   q8, q4, q5          \n"
5063                         "vsub.f32   q9, q4, q5          \n"
5064 
5065                         "vmov       q5, q7              \n" // use q7
5066 
5067                         "vadd.f32   q6, q12, q6         \n" // use q6
5068                         "vmla.f32   q5, q10, %f24[1]    \n"
5069 
5070                         "vmov       q4, q13             \n"
5071 
5072                         "vmla.f32   q2, q12, %e24[0]    \n"
5073                         "vmla.f32   q3, q11, %f24[1]    \n"
5074 
5075                         "vst1.f32   {d16-d17}, [%1]!    \n" // tmp[1][m]
5076 
5077                         "vmla.f32   q4, q6, %e25[1]     \n"
5078                         "vmla.f32   q5, q11, %e24[1]    \n"
5079 
5080                         "vst1.f32   {d18-d19}, [%2]!    \n" // tmp[2][m]
5081 
5082                         "vadd.f32   q8, q2, q3          \n"
5083                         "vsub.f32   q9, q2, q3          \n"
5084 
5085                         "vsub.f32   q6, q15, q10        \n"
5086                         "vsub.f32   q7, q14, q11        \n"
5087 
5088                         "vadd.f32   q2, q4, q5          \n"
5089                         "vsub.f32   q3, q4, q5          \n"
5090 
5091                         "vst1.f32   {d16-d17}, [%3]!    \n" // tmp[3][m]
5092                         "vst1.f32   {d18-d19}, [%4]!    \n" // tmp[4][m]
5093 
5094                         "vmla.f32   q6, q7, %f25[1]     \n"
5095 
5096                         "vst1.f32   {d4-d5}, [%5]!      \n" // tmp[5][m]
5097                         "vst1.f32   {d6-d7}, [%6]!      \n" // tmp[6][m]
5098 
5099                         "vst1.f32   {d12-d13}, [%7]!    \n" // tmp[7][m]
5100 
5101                         : "=r"(t0), // %0
5102                         "=r"(t1), // %1
5103                         "=r"(t2), // %2
5104                         "=r"(t3), // %3
5105                         "=r"(t4), // %4
5106                         "=r"(t5), // %5
5107                         "=r"(t6), // %6
5108                         "=r"(t7), // %7
5109                         "=r"(r0), // %8
5110                         "=r"(r1), // %9
5111                         "=r"(r2), // %10
5112                         "=r"(r3)  // %11
5113                         : "0"(t0),
5114                         "1"(t1),
5115                         "2"(t2),
5116                         "3"(t3),
5117                         "4"(t4),
5118                         "5"(t5),
5119                         "6"(t6),
5120                         "7"(t7),
5121                         "8"(r0),
5122                         "9"(r1),
5123                         "10"(r2),
5124                         "11"(r3),
5125                         "w"(_coeff0), // %24
5126                         "w"(_coeff1), // %25
5127                         "r"(stepw)    // %26
5128                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
5129 
5130                     t0 = tmp[0];
5131                     t1 = tmp[1];
5132                     t2 = tmp[2];
5133                     t3 = tmp[3];
5134 
5135                     float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
5136                     float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 8);
5137                     float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 16);
5138                     float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 24);
5139                     float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 32);
5140                     float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 40);
5141                     float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 48);
5142                     float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 56);
5143 
5144                     int step = img0_tm.w * tiles * 4;
5145 
5146                     asm volatile(
5147 
5148                         // loop0
5149                         "vld1.f32   {d16-d19}, [%8]     \n"
5150                         "add        %8, %8, #128        \n"
5151                         "vld1.f32   {d20-d23}, [%9]     \n"
5152                         "add        %9, %9, #128        \n"
5153                         "vld1.f32   {d24-d27}, [%10]    \n"
5154                         "add        %10, %10, #128      \n"
5155 
5156                         "vtrn.32    q8, q10             \n"
5157 
5158                         "vld1.f32   {d28-d31}, [%11]    \n"
5159                         "add        %11, %11, #128      \n"
5160 
5161                         "vtrn.32    q9, q11             \n"
5162                         "vtrn.32    q12, q14            \n"
5163                         "vtrn.32    q13, q15            \n"
5164 
5165                         "vswp       d17, d24            \n"
5166                         "vswp       d19, d26            \n"
5167                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
5168                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
5169 
5170                         "vsub.f32   q2, q8, q13         \n"
5171                         "vsub.f32   q3, q9, q12         \n"
5172 
5173                         "vadd.f32   q4, q12, q13        \n"
5174                         "vadd.f32   q5, q10, q11        \n"
5175 
5176                         "vmla.f32   q2, q3, %f25[1]     \n"
5177 
5178                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
5179                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
5180 
5181                         "vmls.f32   q4, q9, %f25[0]     \n"
5182                         "vmls.f32   q5, q14, %f25[0]    \n"
5183 
5184                         "vst1.f32   {d4[0]}, [%0], %26  \n"
5185                         "vst1.f32   {d4[1]}, [%1], %26  \n"
5186 
5187                         "vmov       q3, q7              \n" // use q7
5188 
5189                         "vst1.f32   {d5[0]}, [%2], %26  \n"
5190                         "vst1.f32   {d5[1]}, [%3], %26  \n"
5191 
5192                         "vadd.f32   q2, q13, q6         \n" // use q6
5193                         "vmla.f32   q3, q10, %e24[1]    \n"
5194 
5195                         "vadd.f32   q8, q4, q5          \n"
5196                         "vsub.f32   q9, q4, q5          \n"
5197 
5198                         "vmov       q5, q7              \n" // use q7
5199 
5200                         "vadd.f32   q6, q12, q6         \n" // use q6
5201                         "vmla.f32   q5, q10, %f24[1]    \n"
5202 
5203                         "vmov       q4, q13             \n"
5204 
5205                         "vmla.f32   q2, q12, %e24[0]    \n"
5206                         "vmla.f32   q3, q11, %f24[1]    \n"
5207 
5208                         "vst1.f32   {d16[0]}, [%0], %26 \n"
5209                         "vst1.f32   {d16[1]}, [%1], %26 \n"
5210 
5211                         "vmla.f32   q4, q6, %e25[1]     \n"
5212 
5213                         "vst1.f32   {d17[0]}, [%2], %26 \n"
5214                         "vst1.f32   {d17[1]}, [%3], %26 \n"
5215 
5216                         "vmla.f32   q5, q11, %e24[1]    \n"
5217 
5218                         "vst1.f32   {d18[0]}, [%0], %26 \n"
5219                         "vst1.f32   {d18[1]}, [%1], %26 \n"
5220 
5221                         "vadd.f32   q8, q2, q3          \n"
5222 
5223                         "vst1.f32   {d19[0]}, [%2], %26 \n"
5224                         "vst1.f32   {d19[1]}, [%3], %26 \n"
5225 
5226                         "vsub.f32   q9, q2, q3          \n"
5227 
5228                         "vsub.f32   q6, q15, q10        \n"
5229                         "vsub.f32   q7, q14, q11        \n"
5230 
5231                         "vst1.f32   {d16[0]}, [%0], %26 \n"
5232                         "vst1.f32   {d16[1]}, [%1], %26 \n"
5233                         "vst1.f32   {d17[0]}, [%2], %26 \n"
5234                         "vst1.f32   {d17[1]}, [%3], %26 \n"
5235 
5236                         "vadd.f32   q2, q4, q5          \n"
5237 
5238                         "vst1.f32   {d18[0]}, [%0], %26 \n"
5239                         "vst1.f32   {d18[1]}, [%1], %26 \n"
5240                         "vst1.f32   {d19[0]}, [%2], %26 \n"
5241                         "vst1.f32   {d19[1]}, [%3], %26 \n"
5242 
5243                         "vsub.f32   q3, q4, q5          \n"
5244 
5245                         "vst1.f32   {d4[0]}, [%0], %26  \n"
5246                         "vst1.f32   {d4[1]}, [%1], %26  \n"
5247                         "vst1.f32   {d5[0]}, [%2], %26  \n"
5248                         "vst1.f32   {d5[1]}, [%3], %26  \n"
5249 
5250                         "vmla.f32   q6, q7, %f25[1]     \n"
5251 
5252                         "vst1.f32   {d6[0]}, [%0], %26  \n"
5253                         "vst1.f32   {d6[1]}, [%1], %26  \n"
5254                         "vst1.f32   {d7[0]}, [%2], %26  \n"
5255                         "vst1.f32   {d7[1]}, [%3], %26  \n"
5256 
5257                         "vst1.f32   {d12[0]}, [%0]      \n"
5258                         "vst1.f32   {d12[1]}, [%1]      \n"
5259                         "vst1.f32   {d13[0]}, [%2]      \n"
5260                         "vst1.f32   {d13[1]}, [%3]      \n"
5261 
5262                         // loop1
5263                         "vld1.f32   {d16-d19}, [%8]     \n"
5264                         "vld1.f32   {d20-d23}, [%9]     \n"
5265                         "vld1.f32   {d24-d27}, [%10]    \n"
5266 
5267                         "vtrn.32    q8, q10             \n"
5268 
5269                         "vld1.f32   {d28-d31}, [%11]    \n"
5270 
5271                         "vtrn.32    q9, q11             \n"
5272                         "vtrn.32    q12, q14            \n"
5273                         "vtrn.32    q13, q15            \n"
5274 
5275                         "vswp       d17, d24            \n"
5276                         "vswp       d19, d26            \n"
5277                         "vswp       d21, d28            \n" //  q8 = 00   q9 = 44  q10 = 11  q11 = 55
5278                         "vswp       d23, d30            \n" // q12 = 22  q13 = 66  q14 = 33  q15 = 77
5279 
5280                         "vsub.f32   q2, q8, q13         \n"
5281                         "vsub.f32   q3, q9, q12         \n"
5282 
5283                         "vadd.f32   q4, q12, q13        \n"
5284                         "vadd.f32   q5, q10, q11        \n"
5285 
5286                         "vmla.f32   q2, q3, %f25[1]     \n"
5287 
5288                         "vmul.f32   q7, q14, %e25[0]    \n" // q7 = _r_3_x_c
5289                         "vmul.f32   q6, q9, %f24[0]     \n" // q6 = _r_4_x_c
5290 
5291                         "vmls.f32   q4, q9, %f25[0]     \n"
5292                         "vmls.f32   q5, q14, %f25[0]    \n"
5293 
5294                         "vst1.f32   {d4[0]}, [%4], %26  \n"
5295                         "vst1.f32   {d4[1]}, [%5], %26  \n"
5296 
5297                         "vmov       q3, q7              \n" // use q7
5298 
5299                         "vst1.f32   {d5[0]}, [%6], %26  \n"
5300                         "vst1.f32   {d5[1]}, [%7], %26  \n"
5301 
5302                         "vadd.f32   q2, q13, q6         \n" // use q6
5303                         "vmla.f32   q3, q10, %e24[1]    \n"
5304 
5305                         "vadd.f32   q8, q4, q5          \n"
5306                         "vsub.f32   q9, q4, q5          \n"
5307 
5308                         "vmov       q5, q7              \n" // use q7
5309 
5310                         "vadd.f32   q6, q12, q6         \n" // use q6
5311                         "vmla.f32   q5, q10, %f24[1]    \n"
5312 
5313                         "vmov       q4, q13             \n"
5314 
5315                         "vmla.f32   q2, q12, %e24[0]    \n"
5316                         "vmla.f32   q3, q11, %f24[1]    \n"
5317 
5318                         "vst1.f32   {d16[0]}, [%4], %26 \n"
5319                         "vst1.f32   {d16[1]}, [%5], %26 \n"
5320 
5321                         "vmla.f32   q4, q6, %e25[1]     \n"
5322 
5323                         "vst1.f32   {d17[0]}, [%6], %26 \n"
5324                         "vst1.f32   {d17[1]}, [%7], %26 \n"
5325 
5326                         "vmla.f32   q5, q11, %e24[1]    \n"
5327 
5328                         "vst1.f32   {d18[0]}, [%4], %26 \n"
5329                         "vst1.f32   {d18[1]}, [%5], %26 \n"
5330 
5331                         "vadd.f32   q8, q2, q3          \n"
5332 
5333                         "vst1.f32   {d19[0]}, [%6], %26 \n"
5334                         "vst1.f32   {d19[1]}, [%7], %26 \n"
5335 
5336                         "vsub.f32   q9, q2, q3          \n"
5337 
5338                         "vsub.f32   q6, q15, q10        \n"
5339                         "vsub.f32   q7, q14, q11        \n"
5340 
5341                         "vst1.f32   {d16[0]}, [%4], %26 \n"
5342                         "vst1.f32   {d16[1]}, [%5], %26 \n"
5343                         "vst1.f32   {d17[0]}, [%6], %26 \n"
5344                         "vst1.f32   {d17[1]}, [%7], %26 \n"
5345 
5346                         "vadd.f32   q2, q4, q5          \n"
5347 
5348                         "vst1.f32   {d18[0]}, [%4], %26 \n"
5349                         "vst1.f32   {d18[1]}, [%5], %26 \n"
5350                         "vst1.f32   {d19[0]}, [%6], %26 \n"
5351                         "vst1.f32   {d19[1]}, [%7], %26 \n"
5352 
5353                         "vsub.f32   q3, q4, q5          \n"
5354 
5355                         "vst1.f32   {d4[0]}, [%4], %26  \n"
5356                         "vst1.f32   {d4[1]}, [%5], %26  \n"
5357                         "vst1.f32   {d5[0]}, [%6], %26  \n"
5358                         "vst1.f32   {d5[1]}, [%7], %26  \n"
5359 
5360                         "vmla.f32   q6, q7, %f25[1]     \n"
5361 
5362                         "vst1.f32   {d6[0]}, [%4], %26  \n"
5363                         "vst1.f32   {d6[1]}, [%5], %26  \n"
5364                         "vst1.f32   {d7[0]}, [%6], %26  \n"
5365                         "vst1.f32   {d7[1]}, [%7], %26  \n"
5366 
5367                         "vst1.f32   {d12[0]}, [%4]      \n"
5368                         "vst1.f32   {d12[1]}, [%5]      \n"
5369                         "vst1.f32   {d13[0]}, [%6]      \n"
5370                         "vst1.f32   {d13[1]}, [%7]      \n"
5371 
5372                         : "=r"(r0_tm0_0), // %0
5373                         "=r"(r0_tm1_0), // %1
5374                         "=r"(r0_tm2_0), // %2
5375                         "=r"(r0_tm3_0), // %3
5376                         "=r"(r0_tm0_4), // %4
5377                         "=r"(r0_tm1_4), // %5
5378                         "=r"(r0_tm2_4), // %6
5379                         "=r"(r0_tm3_4), // %7
5380                         "=r"(t0),       // %8
5381                         "=r"(t1),       // %9
5382                         "=r"(t2),       // %10
5383                         "=r"(t3)        // %11
5384                         : "0"(r0_tm0_0),
5385                         "1"(r0_tm1_0),
5386                         "2"(r0_tm2_0),
5387                         "3"(r0_tm3_0),
5388                         "4"(r0_tm0_4),
5389                         "5"(r0_tm1_4),
5390                         "6"(r0_tm2_4),
5391                         "7"(r0_tm3_4),
5392                         "8"(t0),
5393                         "9"(t1),
5394                         "10"(t2),
5395                         "11"(t3),
5396                         "w"(_coeff0), // %24
5397                         "w"(_coeff1), // %25
5398                         "r"(step)     // %26
5399                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
5400 #endif // __aarch64__
5401 #else
5402                     const float* r0 = img0.row(i * 6) + j * 6;
5403 
5404                     for (int m = 0; m < 8; m++)
5405                     {
5406                         tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25f;
5407                         tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25f;
5408 
5409                         float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25f);
5410                         float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25f);
5411 
5412                         tmp[1][m] = tmp12a + tmp12b;
5413                         tmp[2][m] = tmp12a - tmp12b;
5414 
5415                         float tmp34a = (r0[6] + r0[2] * 0.25f - r0[4] * 1.25f);
5416                         float tmp34b = (r0[1] * 0.5f - r0[3] * 2.5f + r0[5] * 2.f);
5417 
5418                         tmp[3][m] = tmp34a + tmp34b;
5419                         tmp[4][m] = tmp34a - tmp34b;
5420 
5421                         float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25f) * 4.f);
5422                         float tmp56b = (r0[1] * 2.f - r0[3] * 2.5f + r0[5] * 0.5f);
5423 
5424                         tmp[5][m] = tmp56a + tmp56b;
5425                         tmp[6][m] = tmp56a - tmp56b;
5426 
5427                         r0 += w;
5428                     }
5429 
5430                     float* r0_tm_0 = img0_tm.row(i * w_tm / 8 + j);
5431                     float* r0_tm_1 = img0_tm.row(i * w_tm / 8 + j + tiles);
5432                     float* r0_tm_2 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
5433                     float* r0_tm_3 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
5434                     float* r0_tm_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
5435                     float* r0_tm_5 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
5436                     float* r0_tm_6 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
5437                     float* r0_tm_7 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
5438 
5439                     for (int m = 0; m < 8; m++)
5440                     {
5441                         const float* tmp0 = tmp[m];
5442 
5443                         r0_tm_0[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f;
5444                         r0_tm_7[0] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f;
5445 
5446                         float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25f);
5447                         float tmp12b = (tmp0[1] - tmp0[3] * 4.25f + tmp0[5]);
5448 
5449                         r0_tm_1[0] = tmp12a + tmp12b;
5450                         r0_tm_2[0] = tmp12a - tmp12b;
5451 
5452                         float tmp34a = (tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f);
5453                         float tmp34b = (tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f);
5454 
5455                         r0_tm_3[0] = tmp34a + tmp34b;
5456                         r0_tm_4[0] = tmp34a - tmp34b;
5457 
5458                         float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f);
5459                         float tmp56b = (tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f);
5460 
5461                         r0_tm_5[0] = tmp56a + tmp56b;
5462                         r0_tm_6[0] = tmp56a - tmp56b;
5463 
5464                         r0_tm_0 += img0_tm.w * tiles * 8;
5465                         r0_tm_1 += img0_tm.w * tiles * 8;
5466                         r0_tm_2 += img0_tm.w * tiles * 8;
5467                         r0_tm_3 += img0_tm.w * tiles * 8;
5468                         r0_tm_4 += img0_tm.w * tiles * 8;
5469                         r0_tm_5 += img0_tm.w * tiles * 8;
5470                         r0_tm_6 += img0_tm.w * tiles * 8;
5471                         r0_tm_7 += img0_tm.w * tiles * 8;
5472                     }
5473 #endif // __ARM_NEON
5474                 }
5475             }
5476         }
5477     }
5478     bottom_blob_bordered = Mat();
5479     // END transform input
5480 
5481     // BEGIN dot
5482     Mat top_blob_tm;
5483     {
5484         int w_tm = outw / 6 * 8;
5485         int h_tm = outh / 6 * 8;
5486         const int tiles = w_tm / 8 * h_tm / 8;
5487 
5488         // permute
5489         // bottom_blob_tm.create(1, 64 * tiles, inch);
5490         //         Mat bottom_blob_tm2(inch, tiles, 64);
5491         Mat bottom_blob_tm2(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u, opt.workspace_allocator);
5492 
5493         #pragma omp parallel for num_threads(opt.num_threads)
5494         for (int r = 0; r < 64; r++)
5495         {
5496             Mat tm2 = bottom_blob_tm2.channel(r);
5497 
5498             // tile
5499             int i = 0;
5500             for (; i + 7 < tiles; i += 8)
5501             {
5502                 float* tm2p = tm2.row(i / 8);
5503 
5504                 const float* r0 = bottom_blob_tm;
5505 
5506                 r0 += r * tiles + i;
5507 
5508                 for (int q = 0; q < inch; q++)
5509                 {
5510 #if __ARM_NEON
5511                     float32x4_t _r0 = vld1q_f32(r0);
5512                     float32x4_t _r0n = vld1q_f32(r0 + 4);
5513                     vst1q_f32(tm2p, _r0);
5514                     vst1q_f32(tm2p + 4, _r0n);
5515 #else
5516                     tm2p[0] = r0[0];
5517                     tm2p[1] = r0[1];
5518                     tm2p[2] = r0[2];
5519                     tm2p[3] = r0[3];
5520                     tm2p[4] = r0[4];
5521                     tm2p[5] = r0[5];
5522                     tm2p[6] = r0[6];
5523                     tm2p[7] = r0[7];
5524 #endif // __ARM_NEON
5525 
5526                     r0 += bottom_blob_tm.cstep;
5527                     tm2p += 8;
5528                 }
5529             }
5530             for (; i + 3 < tiles; i += 4)
5531             {
5532                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
5533 
5534                 const float* r0 = bottom_blob_tm;
5535 
5536                 r0 += r * tiles + i;
5537 
5538                 for (int q = 0; q < inch; q++)
5539                 {
5540 #if __ARM_NEON
5541                     float32x4_t _r0 = vld1q_f32(r0);
5542                     vst1q_f32(tm2p, _r0);
5543 #else
5544                     tm2p[0] = r0[0];
5545                     tm2p[1] = r0[1];
5546                     tm2p[2] = r0[2];
5547                     tm2p[3] = r0[3];
5548 #endif // __ARM_NEON
5549 
5550                     r0 += bottom_blob_tm.cstep;
5551                     tm2p += 4;
5552                 }
5553             }
5554             for (; i < tiles; i++)
5555             {
5556                 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + i % 4);
5557 
5558                 const float* r0 = bottom_blob_tm;
5559 
5560                 r0 += r * tiles + i;
5561 
5562                 for (int q = 0; q < inch; q++)
5563                 {
5564                     tm2p[0] = r0[0];
5565 
5566                     r0 += bottom_blob_tm.cstep;
5567                     tm2p += 1;
5568                 }
5569             }
5570         }
5571 
5572         bottom_blob_tm = Mat();
5573         // permute end
5574 
5575         top_blob_tm.create(1, 64 * tiles, outch);
5576 
5577         int nn_outch = 0;
5578         int remain_outch_start = 0;
5579 
5580 #if __ARM_NEON && __aarch64__
5581         nn_outch = outch >> 3;
5582         remain_outch_start = nn_outch << 3;
5583 
5584         #pragma omp parallel for num_threads(opt.num_threads)
5585         for (int pp = 0; pp < nn_outch; pp++)
5586         {
5587             int p = pp * 8;
5588 
5589             const Mat kernel_tm0 = kernel_tm.channel(p / 8);
5590 
5591             Mat out0_tm = top_blob_tm.channel(p);
5592             Mat out1_tm = top_blob_tm.channel(p + 1);
5593             Mat out2_tm = top_blob_tm.channel(p + 2);
5594             Mat out3_tm = top_blob_tm.channel(p + 3);
5595             Mat out4_tm = top_blob_tm.channel(p + 4);
5596             Mat out5_tm = top_blob_tm.channel(p + 5);
5597             Mat out6_tm = top_blob_tm.channel(p + 6);
5598             Mat out7_tm = top_blob_tm.channel(p + 7);
5599 
5600             float* output0_tm = out0_tm;
5601             float* output1_tm = out1_tm;
5602             float* output2_tm = out2_tm;
5603             float* output3_tm = out3_tm;
5604             float* output4_tm = out4_tm;
5605             float* output5_tm = out5_tm;
5606             float* output6_tm = out6_tm;
5607             float* output7_tm = out7_tm;
5608 
5609             for (int r = 0; r < 64; r++)
5610             {
5611                 const Mat bb2 = bottom_blob_tm2.channel(r);
5612 
5613                 // tile
5614                 int i = 0;
5615                 for (; i + 7 < tiles; i += 8)
5616                 {
5617                     const float* bb2p0 = bb2.row(i / 8);
5618 
5619                     const float* ktm0 = kernel_tm0.row(r);
5620 
5621                     asm volatile(
5622                         "eor    v16.16b, v16.16b, v16.16b  \n"
5623                         "eor    v17.16b, v17.16b, v17.16b  \n"
5624                         "eor    v18.16b, v18.16b, v18.16b  \n"
5625                         "eor    v19.16b, v19.16b, v19.16b  \n"
5626                         "eor    v20.16b, v20.16b, v20.16b  \n"
5627                         "eor    v21.16b, v21.16b, v21.16b  \n"
5628                         "eor    v22.16b, v22.16b, v22.16b  \n"
5629                         "eor    v23.16b, v23.16b, v23.16b  \n"
5630                         "eor    v24.16b, v24.16b, v24.16b  \n"
5631                         "eor    v25.16b, v25.16b, v25.16b  \n"
5632                         "eor    v26.16b, v26.16b, v26.16b  \n"
5633                         "eor    v27.16b, v27.16b, v27.16b  \n"
5634                         "eor    v28.16b, v28.16b, v28.16b  \n"
5635                         "eor    v29.16b, v29.16b, v29.16b  \n"
5636                         "eor    v30.16b, v30.16b, v30.16b  \n"
5637                         "eor    v31.16b, v31.16b, v31.16b  \n"
5638 
5639                         // inch loop
5640                         "lsr    w4, %w20, #2            \n" // w4 = nn = inch >> 2
5641                         "cmp    w4, #0                  \n"
5642                         "beq    1f                      \n"
5643 
5644                         "0:                             \n"
5645 
5646                         "prfm   pldl1keep, [%8, #512]   \n"
5647                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64   \n"
5648 
5649                         "prfm   pldl1keep, [%9, #512]   \n"
5650                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64   \n"
5651 
5652                         "fmla   v16.4s, v8.4s, v0.s[0]  \n"
5653                         "fmla   v17.4s, v9.4s, v0.s[0]  \n"
5654                         "fmla   v18.4s, v8.4s, v0.s[1]  \n"
5655                         "fmla   v19.4s, v9.4s, v0.s[1]  \n"
5656                         "fmla   v20.4s, v8.4s, v0.s[2]  \n"
5657                         "fmla   v21.4s, v9.4s, v0.s[2]  \n"
5658                         "fmla   v22.4s, v8.4s, v0.s[3]  \n"
5659                         "fmla   v23.4s, v9.4s, v0.s[3]  \n"
5660 
5661                         "prfm   pldl1keep, [%9, #512]   \n"
5662                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64   \n"
5663 
5664                         "fmla   v24.4s, v8.4s, v1.s[0]  \n"
5665                         "fmla   v25.4s, v9.4s, v1.s[0]  \n"
5666                         "fmla   v26.4s, v8.4s, v1.s[1]  \n"
5667                         "fmla   v27.4s, v9.4s, v1.s[1]  \n"
5668                         "fmla   v28.4s, v8.4s, v1.s[2]  \n"
5669                         "fmla   v29.4s, v9.4s, v1.s[2]  \n"
5670                         "fmla   v30.4s, v8.4s, v1.s[3]  \n"
5671                         "fmla   v31.4s, v9.4s, v1.s[3]  \n"
5672 
5673                         "fmla   v16.4s, v10.4s, v2.s[0] \n"
5674                         "fmla   v17.4s, v11.4s, v2.s[0] \n"
5675                         "fmla   v18.4s, v10.4s, v2.s[1] \n"
5676                         "fmla   v19.4s, v11.4s, v2.s[1] \n"
5677                         "fmla   v20.4s, v10.4s, v2.s[2] \n"
5678                         "fmla   v21.4s, v11.4s, v2.s[2] \n"
5679                         "fmla   v22.4s, v10.4s, v2.s[3] \n"
5680                         "fmla   v23.4s, v11.4s, v2.s[3] \n"
5681 
5682                         "prfm   pldl1keep, [%8, #512]   \n"
5683                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n"
5684 
5685                         "fmla   v24.4s, v10.4s, v3.s[0] \n"
5686                         "fmla   v25.4s, v11.4s, v3.s[0] \n"
5687                         "fmla   v26.4s, v10.4s, v3.s[1] \n"
5688                         "fmla   v27.4s, v11.4s, v3.s[1] \n"
5689                         "fmla   v28.4s, v10.4s, v3.s[2] \n"
5690                         "fmla   v29.4s, v11.4s, v3.s[2] \n"
5691                         "fmla   v30.4s, v10.4s, v3.s[3] \n"
5692                         "fmla   v31.4s, v11.4s, v3.s[3] \n"
5693 
5694                         "fmla   v16.4s, v12.4s, v4.s[0] \n"
5695                         "fmla   v17.4s, v13.4s, v4.s[0] \n"
5696                         "fmla   v18.4s, v12.4s, v4.s[1] \n"
5697                         "fmla   v19.4s, v13.4s, v4.s[1] \n"
5698                         "fmla   v20.4s, v12.4s, v4.s[2] \n"
5699                         "fmla   v21.4s, v13.4s, v4.s[2] \n"
5700                         "fmla   v22.4s, v12.4s, v4.s[3] \n"
5701                         "fmla   v23.4s, v13.4s, v4.s[3] \n"
5702 
5703                         "fmla   v24.4s, v12.4s, v5.s[0] \n"
5704                         "fmla   v25.4s, v13.4s, v5.s[0] \n"
5705                         "fmla   v26.4s, v12.4s, v5.s[1] \n"
5706                         "fmla   v27.4s, v13.4s, v5.s[1] \n"
5707                         "fmla   v28.4s, v12.4s, v5.s[2] \n"
5708                         "fmla   v29.4s, v13.4s, v5.s[2] \n"
5709                         "fmla   v30.4s, v12.4s, v5.s[3] \n"
5710                         "fmla   v31.4s, v13.4s, v5.s[3] \n"
5711 
5712                         "fmla   v16.4s, v14.4s, v6.s[0] \n"
5713                         "fmla   v17.4s, v15.4s, v6.s[0] \n"
5714                         "fmla   v18.4s, v14.4s, v6.s[1] \n"
5715                         "fmla   v19.4s, v15.4s, v6.s[1] \n"
5716                         "fmla   v20.4s, v14.4s, v6.s[2] \n"
5717                         "fmla   v21.4s, v15.4s, v6.s[2] \n"
5718                         "fmla   v22.4s, v14.4s, v6.s[3] \n"
5719                         "fmla   v23.4s, v15.4s, v6.s[3] \n"
5720 
5721                         "subs   w4, w4, #1              \n"
5722 
5723                         "fmla   v24.4s, v14.4s, v7.s[0] \n"
5724                         "fmla   v25.4s, v15.4s, v7.s[0] \n"
5725                         "fmla   v26.4s, v14.4s, v7.s[1] \n"
5726                         "fmla   v27.4s, v15.4s, v7.s[1] \n"
5727                         "fmla   v28.4s, v14.4s, v7.s[2] \n"
5728                         "fmla   v29.4s, v15.4s, v7.s[2] \n"
5729                         "fmla   v30.4s, v14.4s, v7.s[3] \n"
5730                         "fmla   v31.4s, v15.4s, v7.s[3] \n"
5731 
5732                         "bne    0b                      \n"
5733 
5734                         "1:                             \n"
5735 
5736                         // remain loop
5737                         "and    w4, %w20, #3            \n" // w4 = remain = tiles & 3;
5738                         "cmp    w4, #0                  \n"
5739                         "beq    3f                      \n"
5740 
5741                         "2:                             \n"
5742 
5743                         "prfm   pldl1keep, [%8, #256]   \n"
5744                         "ld1    {v8.4s, v9.4s}, [%8], #32   \n"
5745 
5746                         "prfm   pldl1keep, [%9, #256]   \n"
5747                         "ld1    {v0.4s, v1.4s}, [%9], #32   \n"
5748 
5749                         "fmla   v16.4s, v8.4s, v0.s[0]  \n"
5750                         "fmla   v17.4s, v9.4s, v0.s[0]  \n"
5751                         "fmla   v18.4s, v8.4s, v0.s[1]  \n"
5752                         "fmla   v19.4s, v9.4s, v0.s[1]  \n"
5753                         "fmla   v20.4s, v8.4s, v0.s[2]  \n"
5754                         "fmla   v21.4s, v9.4s, v0.s[2]  \n"
5755                         "fmla   v22.4s, v8.4s, v0.s[3]  \n"
5756                         "fmla   v23.4s, v9.4s, v0.s[3]  \n"
5757 
5758                         "subs   w4, w4, #1              \n"
5759 
5760                         "fmla   v24.4s, v8.4s, v1.s[0]  \n"
5761                         "fmla   v25.4s, v9.4s, v1.s[0]  \n"
5762                         "fmla   v26.4s, v8.4s, v1.s[1]  \n"
5763                         "fmla   v27.4s, v9.4s, v1.s[1]  \n"
5764                         "fmla   v28.4s, v8.4s, v1.s[2]  \n"
5765                         "fmla   v29.4s, v9.4s, v1.s[2]  \n"
5766                         "fmla   v30.4s, v8.4s, v1.s[3]  \n"
5767                         "fmla   v31.4s, v9.4s, v1.s[3]  \n"
5768 
5769                         "bne    2b                      \n"
5770 
5771                         "3:                             \n"
5772 
5773                         "st1    {v16.4s, v17.4s}, [%0], #32 \n"
5774                         "st1    {v18.4s, v19.4s}, [%1], #32 \n"
5775                         "st1    {v20.4s, v21.4s}, [%2], #32 \n"
5776                         "st1    {v22.4s, v23.4s}, [%3], #32 \n"
5777                         "st1    {v24.4s, v25.4s}, [%4], #32 \n"
5778                         "st1    {v26.4s, v27.4s}, [%5], #32 \n"
5779                         "st1    {v28.4s, v29.4s}, [%6], #32 \n"
5780                         "st1    {v30.4s, v31.4s}, [%7], #32 \n"
5781 
5782                         : "=r"(output0_tm), // %0
5783                         "=r"(output1_tm), // %1
5784                         "=r"(output2_tm), // %2
5785                         "=r"(output3_tm), // %3
5786                         "=r"(output4_tm), // %4
5787                         "=r"(output5_tm), // %5
5788                         "=r"(output6_tm), // %6
5789                         "=r"(output7_tm), // %7
5790                         "=r"(bb2p0),      // %8
5791                         "=r"(ktm0)        // %9
5792                         : "0"(output0_tm),
5793                         "1"(output1_tm),
5794                         "2"(output2_tm),
5795                         "3"(output3_tm),
5796                         "4"(output4_tm),
5797                         "5"(output5_tm),
5798                         "6"(output6_tm),
5799                         "7"(output7_tm),
5800                         "8"(bb2p0),
5801                         "9"(ktm0),
5802                         "r"(inch) // %20
5803                         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
5804                 }
5805                 for (; i + 3 < tiles; i += 4)
5806                 {
5807                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
5808 
5809                     const float* ktm0 = kernel_tm0.row(r);
5810 
5811                     asm volatile(
5812                         "eor    v16.16b, v16.16b, v16.16b  \n"
5813                         "eor    v17.16b, v17.16b, v17.16b  \n"
5814                         "eor    v18.16b, v18.16b, v18.16b  \n"
5815                         "eor    v19.16b, v19.16b, v19.16b  \n"
5816                         "eor    v20.16b, v20.16b, v20.16b  \n"
5817                         "eor    v21.16b, v21.16b, v21.16b  \n"
5818                         "eor    v22.16b, v22.16b, v22.16b  \n"
5819                         "eor    v23.16b, v23.16b, v23.16b  \n"
5820 
5821                         // inch loop
5822                         "lsr    w4, %w20, #2            \n" // w4 = nn = inch >> 2
5823                         "cmp    w4, #0                  \n"
5824                         "beq    1f                      \n"
5825 
5826                         "0:                             \n"
5827 
5828                         "prfm   pldl1keep, [%8, #512]   \n"
5829                         "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64 \n"
5830 
5831                         "prfm   pldl1keep, [%9, #512]   \n"
5832                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64   \n"
5833 
5834                         "fmla   v16.4s, v8.4s, v0.s[0]  \n"
5835                         "fmla   v17.4s, v8.4s, v0.s[1]  \n"
5836                         "fmla   v18.4s, v8.4s, v0.s[2]  \n"
5837                         "fmla   v19.4s, v8.4s, v0.s[3]  \n"
5838                         "fmla   v20.4s, v8.4s, v1.s[0]  \n"
5839                         "fmla   v21.4s, v8.4s, v1.s[1]  \n"
5840                         "fmla   v22.4s, v8.4s, v1.s[2]  \n"
5841                         "fmla   v23.4s, v8.4s, v1.s[3]  \n"
5842 
5843                         "prfm   pldl1keep, [%9, #512]   \n"
5844                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64   \n"
5845 
5846                         "fmla   v16.4s, v9.4s, v2.s[0]  \n"
5847                         "fmla   v17.4s, v9.4s, v2.s[1]  \n"
5848                         "fmla   v18.4s, v9.4s, v2.s[2]  \n"
5849                         "fmla   v19.4s, v9.4s, v2.s[3]  \n"
5850                         "fmla   v20.4s, v9.4s, v3.s[0]  \n"
5851                         "fmla   v21.4s, v9.4s, v3.s[1]  \n"
5852                         "fmla   v22.4s, v9.4s, v3.s[2]  \n"
5853                         "fmla   v23.4s, v9.4s, v3.s[3]  \n"
5854 
5855                         "fmla   v16.4s, v10.4s, v4.s[0] \n"
5856                         "fmla   v17.4s, v10.4s, v4.s[1] \n"
5857                         "fmla   v18.4s, v10.4s, v4.s[2] \n"
5858                         "fmla   v19.4s, v10.4s, v4.s[3] \n"
5859                         "fmla   v20.4s, v10.4s, v5.s[0] \n"
5860                         "fmla   v21.4s, v10.4s, v5.s[1] \n"
5861                         "fmla   v22.4s, v10.4s, v5.s[2] \n"
5862                         "fmla   v23.4s, v10.4s, v5.s[3] \n"
5863 
5864                         "subs   w4, w4, #1              \n"
5865 
5866                         "fmla   v16.4s, v11.4s, v6.s[0] \n"
5867                         "fmla   v17.4s, v11.4s, v6.s[1] \n"
5868                         "fmla   v18.4s, v11.4s, v6.s[2] \n"
5869                         "fmla   v19.4s, v11.4s, v6.s[3] \n"
5870                         "fmla   v20.4s, v11.4s, v7.s[0] \n"
5871                         "fmla   v21.4s, v11.4s, v7.s[1] \n"
5872                         "fmla   v22.4s, v11.4s, v7.s[2] \n"
5873                         "fmla   v23.4s, v11.4s, v7.s[3] \n"
5874 
5875                         "bne    0b                      \n"
5876 
5877                         "1:                             \n"
5878 
5879                         // remain loop
5880                         "and    w4, %w20, #3            \n" // w4 = remain = tiles & 3;
5881                         "cmp    w4, #0                  \n"
5882                         "beq    3f                      \n"
5883 
5884                         "2:                             \n"
5885 
5886                         "prfm   pldl1keep, [%8, #128]   \n"
5887                         "ld1    {v8.4s}, [%8], #16      \n"
5888 
5889                         "prfm   pldl1keep, [%9, #256]   \n"
5890                         "ld1    {v0.4s, v1.4s}, [%9], #32   \n"
5891 
5892                         "fmla   v16.4s, v8.4s, v0.s[0]  \n"
5893                         "fmla   v17.4s, v8.4s, v0.s[1]  \n"
5894                         "fmla   v18.4s, v8.4s, v0.s[2]  \n"
5895                         "fmla   v19.4s, v8.4s, v0.s[3]  \n"
5896 
5897                         "subs   w4, w4, #1              \n"
5898 
5899                         "fmla   v20.4s, v8.4s, v1.s[0]  \n"
5900                         "fmla   v21.4s, v8.4s, v1.s[1]  \n"
5901                         "fmla   v22.4s, v8.4s, v1.s[2]  \n"
5902                         "fmla   v23.4s, v8.4s, v1.s[3]  \n"
5903 
5904                         "bne    2b                      \n"
5905 
5906                         "3:                             \n"
5907 
5908                         "st1    {v16.4s}, [%0], #16     \n"
5909                         "st1    {v17.4s}, [%1], #16     \n"
5910                         "st1    {v18.4s}, [%2], #16     \n"
5911                         "st1    {v19.4s}, [%3], #16     \n"
5912                         "st1    {v20.4s}, [%4], #16     \n"
5913                         "st1    {v21.4s}, [%5], #16     \n"
5914                         "st1    {v22.4s}, [%6], #16     \n"
5915                         "st1    {v23.4s}, [%7], #16     \n"
5916 
5917                         : "=r"(output0_tm), // %0
5918                         "=r"(output1_tm), // %1
5919                         "=r"(output2_tm), // %2
5920                         "=r"(output3_tm), // %3
5921                         "=r"(output4_tm), // %4
5922                         "=r"(output5_tm), // %5
5923                         "=r"(output6_tm), // %6
5924                         "=r"(output7_tm), // %7
5925                         "=r"(bb2p0),      // %8
5926                         "=r"(ktm0)        // %9
5927                         : "0"(output0_tm),
5928                         "1"(output1_tm),
5929                         "2"(output2_tm),
5930                         "3"(output3_tm),
5931                         "4"(output4_tm),
5932                         "5"(output5_tm),
5933                         "6"(output6_tm),
5934                         "7"(output7_tm),
5935                         "8"(bb2p0),
5936                         "9"(ktm0),
5937                         "r"(inch) // %20
5938                         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
5939                 }
5940                 for (; i < tiles; i++)
5941                 {
5942                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
5943 
5944                     const float* ktm0 = kernel_tm0.row(r);
5945 
5946                     float32x4_t _sum0123 = vdupq_n_f32(0.f);
5947                     float32x4_t _sum4567 = vdupq_n_f32(0.f);
5948 
5949                     int q = 0;
5950                     for (; q + 3 < inch; q += 4)
5951                     {
5952                         //                         asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
5953                         float32x4_t _bb2p0 = vld1q_f32(bb2p0);
5954                         bb2p0 += 4;
5955 
5956                         //                         asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
5957                         float32x4_t _ktm0 = vld1q_f32(ktm0 + 0);
5958                         float32x4_t _ktm1 = vld1q_f32(ktm0 + 4);
5959                         float32x4_t _ktm2 = vld1q_f32(ktm0 + 8);
5960                         float32x4_t _ktm3 = vld1q_f32(ktm0 + 12);
5961                         ktm0 += 16;
5962 
5963                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm0, _bb2p0, 0);
5964                         _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm1, _bb2p0, 0);
5965                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm2, _bb2p0, 1);
5966                         _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm3, _bb2p0, 1);
5967 
5968                         //                         asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
5969                         float32x4_t _ktm4 = vld1q_f32(ktm0 + 0);
5970                         float32x4_t _ktm5 = vld1q_f32(ktm0 + 4);
5971                         float32x4_t _ktm6 = vld1q_f32(ktm0 + 8);
5972                         float32x4_t _ktm7 = vld1q_f32(ktm0 + 12);
5973                         ktm0 += 16;
5974 
5975                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm4, _bb2p0, 2);
5976                         _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm5, _bb2p0, 2);
5977                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm6, _bb2p0, 3);
5978                         _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm7, _bb2p0, 3);
5979                     }
5980 
5981                     for (; q < inch; q++)
5982                     {
5983                         float32x4_t _bb2p0 = vld1q_dup_f32(bb2p0);
5984                         float32x4_t _ktm0123 = vld1q_f32(ktm0 + 0);
5985                         float32x4_t _ktm4567 = vld1q_f32(ktm0 + 4);
5986 
5987                         _sum0123 = vmlaq_f32(_sum0123, _bb2p0, _ktm0123);
5988                         _sum4567 = vmlaq_f32(_sum4567, _bb2p0, _ktm4567);
5989 
5990                         bb2p0 += 1;
5991                         ktm0 += 8;
5992                     }
5993 
5994                     float sum0 = vgetq_lane_f32(_sum0123, 0);
5995                     float sum1 = vgetq_lane_f32(_sum0123, 1);
5996                     float sum2 = vgetq_lane_f32(_sum0123, 2);
5997                     float sum3 = vgetq_lane_f32(_sum0123, 3);
5998                     float sum4 = vgetq_lane_f32(_sum4567, 0);
5999                     float sum5 = vgetq_lane_f32(_sum4567, 1);
6000                     float sum6 = vgetq_lane_f32(_sum4567, 2);
6001                     float sum7 = vgetq_lane_f32(_sum4567, 3);
6002 
6003                     output0_tm[0] = sum0;
6004                     output1_tm[0] = sum1;
6005                     output2_tm[0] = sum2;
6006                     output3_tm[0] = sum3;
6007                     output4_tm[0] = sum4;
6008                     output5_tm[0] = sum5;
6009                     output6_tm[0] = sum6;
6010                     output7_tm[0] = sum7;
6011 
6012                     output0_tm += 1;
6013                     output1_tm += 1;
6014                     output2_tm += 1;
6015                     output3_tm += 1;
6016                     output4_tm += 1;
6017                     output5_tm += 1;
6018                     output6_tm += 1;
6019                     output7_tm += 1;
6020                 }
6021             }
6022         }
6023 #endif // __aarch64__
6024 
6025         nn_outch = (outch - remain_outch_start) >> 2;
6026 
6027         #pragma omp parallel for num_threads(opt.num_threads)
6028         for (int pp = 0; pp < nn_outch; pp++)
6029         {
6030             int p = remain_outch_start + pp * 4;
6031 
6032 #if __ARM_NEON && __aarch64__
6033             const Mat kernel_tm0 = kernel_tm.channel(p / 8 + (p % 8) / 4);
6034 #else
6035             const Mat kernel_tm0 = kernel_tm.channel(p / 4);
6036 #endif
6037 
6038             Mat out0_tm = top_blob_tm.channel(p);
6039             Mat out1_tm = top_blob_tm.channel(p + 1);
6040             Mat out2_tm = top_blob_tm.channel(p + 2);
6041             Mat out3_tm = top_blob_tm.channel(p + 3);
6042 
6043             float* output0_tm = out0_tm;
6044             float* output1_tm = out1_tm;
6045             float* output2_tm = out2_tm;
6046             float* output3_tm = out3_tm;
6047 
6048             for (int r = 0; r < 64; r++)
6049             {
6050                 const Mat bb2 = bottom_blob_tm2.channel(r);
6051 
6052                 // tile
6053                 int i = 0;
6054                 for (; i + 7 < tiles; i += 8)
6055                 {
6056                     const float* bb2p0 = bb2.row(i / 8);
6057 
6058                     const float* ktm0 = kernel_tm0.row(r);
6059 #if __ARM_NEON
6060 #if __aarch64__
6061                     asm volatile(
6062                         "eor    v8.16b, v8.16b, v8.16b     \n"
6063                         "eor    v9.16b, v9.16b, v9.16b     \n"
6064                         "eor    v10.16b, v10.16b, v10.16b  \n"
6065                         "eor    v11.16b, v11.16b, v11.16b  \n"
6066                         "eor    v12.16b, v12.16b, v12.16b  \n"
6067                         "eor    v13.16b, v13.16b, v13.16b  \n"
6068                         "eor    v14.16b, v14.16b, v14.16b  \n"
6069                         "eor    v15.16b, v15.16b, v15.16b  \n"
6070 
6071                         // inch loop
6072                         "lsr    w4, %w12, #2            \n" // w4 = nn = inch >> 2
6073                         "cmp    w4, #0                  \n"
6074                         "beq    1f                      \n"
6075 
6076                         "0:                             \n"
6077 
6078                         "prfm   pldl1keep, [%4, #512]   \n"
6079                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64     \n"
6080 
6081                         "prfm   pldl1keep, [%5, #512]   \n"
6082                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64     \n"
6083 
6084                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6085                         "fmla   v9.4s, v5.4s, v0.s[0]   \n"
6086                         "fmla   v10.4s, v4.4s, v0.s[1]  \n"
6087                         "fmla   v11.4s, v5.4s, v0.s[1]  \n"
6088                         "fmla   v12.4s, v4.4s, v0.s[2]  \n"
6089                         "fmla   v13.4s, v5.4s, v0.s[2]  \n"
6090                         "fmla   v14.4s, v4.4s, v0.s[3]  \n"
6091                         "fmla   v15.4s, v5.4s, v0.s[3]  \n"
6092 
6093                         "prfm   pldl1keep, [%4, #512]   \n"
6094                         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
6095 
6096                         "fmla   v8.4s, v6.4s, v1.s[0]   \n"
6097                         "fmla   v9.4s, v7.4s, v1.s[0]   \n"
6098                         "fmla   v10.4s, v6.4s, v1.s[1]  \n"
6099                         "fmla   v11.4s, v7.4s, v1.s[1]  \n"
6100                         "fmla   v12.4s, v6.4s, v1.s[2]  \n"
6101                         "fmla   v13.4s, v7.4s, v1.s[2]  \n"
6102                         "fmla   v14.4s, v6.4s, v1.s[3]  \n"
6103                         "fmla   v15.4s, v7.4s, v1.s[3]  \n"
6104 
6105                         "fmla   v8.4s, v16.4s, v2.s[0]  \n"
6106                         "fmla   v9.4s, v17.4s, v2.s[0]  \n"
6107                         "fmla   v10.4s, v16.4s, v2.s[1] \n"
6108                         "fmla   v11.4s, v17.4s, v2.s[1] \n"
6109                         "fmla   v12.4s, v16.4s, v2.s[2] \n"
6110                         "fmla   v13.4s, v17.4s, v2.s[2] \n"
6111                         "fmla   v14.4s, v16.4s, v2.s[3] \n"
6112                         "fmla   v15.4s, v17.4s, v2.s[3] \n"
6113 
6114                         "fmla   v8.4s, v18.4s, v3.s[0]  \n"
6115                         "fmla   v9.4s, v19.4s, v3.s[0]  \n"
6116                         "fmla   v10.4s, v18.4s, v3.s[1] \n"
6117                         "fmla   v11.4s, v19.4s, v3.s[1] \n"
6118                         "fmla   v12.4s, v18.4s, v3.s[2] \n"
6119                         "fmla   v13.4s, v19.4s, v3.s[2] \n"
6120                         "fmla   v14.4s, v18.4s, v3.s[3] \n"
6121                         "fmla   v15.4s, v19.4s, v3.s[3] \n"
6122 
6123                         "subs   w4, w4, #1              \n"
6124                         "bne    0b                      \n"
6125 
6126                         "1:                             \n"
6127 
6128                         // remain loop
6129                         "and    w4, %w12, #3            \n" // w4 = remain = tiles & 3;
6130                         "cmp    w4, #0                  \n"
6131                         "beq    3f                      \n"
6132 
6133                         "2:                             \n"
6134 
6135                         "prfm   pldl1keep, [%4, #256]   \n"
6136                         "ld1    {v4.4s, v5.4s}, [%4], #32      \n"
6137 
6138                         "prfm   pldl1keep, [%5, #128]   \n"
6139                         "ld1    {v0.4s}, [%5], #16      \n"
6140 
6141                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6142                         "fmla   v9.4s, v5.4s, v0.s[0]   \n"
6143                         "fmla   v10.4s, v4.4s, v0.s[1]  \n"
6144                         "fmla   v11.4s, v5.4s, v0.s[1]  \n"
6145                         "fmla   v12.4s, v4.4s, v0.s[2]  \n"
6146                         "fmla   v13.4s, v5.4s, v0.s[2]  \n"
6147                         "fmla   v14.4s, v4.4s, v0.s[3]  \n"
6148                         "fmla   v15.4s, v5.4s, v0.s[3]  \n"
6149 
6150                         "subs   w4, w4, #1              \n"
6151                         "bne    2b                      \n"
6152 
6153                         "3:                             \n"
6154 
6155                         "st1    {v8.4s, v9.4s}, [%0], #32       \n"
6156                         "st1    {v10.4s, v11.4s}, [%1], #32     \n"
6157                         "st1    {v12.4s, v13.4s}, [%2], #32     \n"
6158                         "st1    {v14.4s, v15.4s}, [%3], #32     \n"
6159 
6160                         : "=r"(output0_tm), // %0
6161                         "=r"(output1_tm), // %1
6162                         "=r"(output2_tm), // %2
6163                         "=r"(output3_tm), // %3
6164                         "=r"(bb2p0),      // %4
6165                         "=r"(ktm0)        // %5
6166                         : "0"(output0_tm),
6167                         "1"(output1_tm),
6168                         "2"(output2_tm),
6169                         "3"(output3_tm),
6170                         "4"(bb2p0),
6171                         "5"(ktm0),
6172                         "r"(inch) // %12
6173                         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
6174 #else  // __aarch64__
6175                     asm volatile(
6176                         "veor       q8, q8, q8      \n"
6177                         "veor       q9, q9, q9      \n"
6178                         "veor       q10, q10, q10   \n"
6179                         "veor       q11, q11, q11   \n"
6180                         "veor       q12, q12, q12   \n"
6181                         "veor       q13, q13, q13   \n"
6182                         "veor       q14, q14, q14   \n"
6183                         "veor       q15, q15, q15   \n"
6184 
6185                         // inch loop
6186                         "lsr        r4, %12, #2     \n" // r4 = nn = inch >> 2
6187                         "cmp        r4, #0          \n"
6188                         "beq        1f              \n"
6189 
6190                         "0:                         \n"
6191 
6192                         "pld        [%4, #512]      \n"
6193                         "vldm       %4!, {d8-d15}   \n"
6194                         //                         "vld1.f32   {d8-d11}, [%4 :128]! \n"
6195                         //                         "vld1.f32   {d12-d15}, [%4 :128]! \n"
6196 
6197                         "pld        [%5, #512]      \n"
6198                         "vldm       %5!, {d0-d7}    \n"
6199                         //                         "vld1.f32   {d0-d3}, [%5 :128]!  \n"
6200                         //                         "vld1.f32   {d4-d7}, [%5 :128]!  \n"
6201 
6202                         "vmla.f32   q8, q4, d0[0]   \n"
6203                         "vmla.f32   q9, q5, d0[0]   \n"
6204                         "vmla.f32   q10, q4, d0[1]  \n"
6205                         "vmla.f32   q11, q5, d0[1]  \n"
6206                         "vmla.f32   q12, q4, d1[0]  \n"
6207                         "vmla.f32   q13, q5, d1[0]  \n"
6208                         "vmla.f32   q14, q4, d1[1]  \n"
6209                         "vmla.f32   q15, q5, d1[1]  \n"
6210 
6211                         "vmla.f32   q8, q6, d2[0]   \n"
6212                         "vmla.f32   q9, q7, d2[0]   \n"
6213                         "vmla.f32   q10, q6, d2[1]  \n"
6214                         "vmla.f32   q11, q7, d2[1]  \n"
6215                         "vmla.f32   q12, q6, d3[0]  \n"
6216                         "vmla.f32   q13, q7, d3[0]  \n"
6217                         "vmla.f32   q14, q6, d3[1]  \n"
6218                         "vmla.f32   q15, q7, d3[1]  \n"
6219 
6220                         "pld        [%4, #512]      \n"
6221                         "vldm       %4!, {d8-d15}   \n"
6222                         //                         "vld1.f32   {d8-d11}, [%4 :128]! \n"
6223                         //                         "vld1.f32   {d12-d15}, [%4 :128]! \n"
6224 
6225                         "vmla.f32   q8, q4, d4[0]   \n"
6226                         "vmla.f32   q9, q5, d4[0]   \n"
6227                         "vmla.f32   q10, q4, d4[1]  \n"
6228                         "vmla.f32   q11, q5, d4[1]  \n"
6229                         "vmla.f32   q12, q4, d5[0]  \n"
6230                         "vmla.f32   q13, q5, d5[0]  \n"
6231                         "vmla.f32   q14, q4, d5[1]  \n"
6232                         "vmla.f32   q15, q5, d5[1]  \n"
6233 
6234                         "subs       r4, r4, #1      \n"
6235 
6236                         "vmla.f32   q8, q6, d6[0]   \n"
6237                         "vmla.f32   q9, q7, d6[0]   \n"
6238                         "vmla.f32   q10, q6, d6[1]  \n"
6239                         "vmla.f32   q11, q7, d6[1]  \n"
6240                         "vmla.f32   q12, q6, d7[0]  \n"
6241                         "vmla.f32   q13, q7, d7[0]  \n"
6242                         "vmla.f32   q14, q6, d7[1]  \n"
6243                         "vmla.f32   q15, q7, d7[1]  \n"
6244 
6245                         "bne        0b              \n"
6246 
6247                         "1:                         \n"
6248 
6249                         // remain loop
6250                         "and        r4, %12, #3     \n" // r4 = remain = tiles & 3;
6251                         "cmp        r4, #0          \n"
6252                         "beq        3f              \n"
6253 
6254                         "2:                         \n"
6255 
6256                         "pld        [%4, #256]      \n"
6257                         "vld1.f32   {d8-d11}, [%4 :128]! \n"
6258 
6259                         "pld        [%5, #128]      \n"
6260                         "vld1.f32   {d0-d1}, [%5 :128]!  \n"
6261 
6262                         "vmla.f32   q8, q4, d0[0]   \n"
6263                         "vmla.f32   q9, q5, d0[0]   \n"
6264                         "vmla.f32   q10, q4, d0[1]  \n"
6265                         "vmla.f32   q11, q5, d0[1]  \n"
6266 
6267                         "subs       r4, r4, #1      \n"
6268 
6269                         "vmla.f32   q12, q4, d1[0]  \n"
6270                         "vmla.f32   q13, q5, d1[0]  \n"
6271                         "vmla.f32   q14, q4, d1[1]  \n"
6272                         "vmla.f32   q15, q5, d1[1]  \n"
6273 
6274                         "bne        2b              \n"
6275 
6276                         "3:                         \n"
6277 
6278                         "vst1.f32   {d16-d19}, [%0]! \n"
6279                         "vst1.f32   {d20-d23}, [%1]! \n"
6280                         "vst1.f32   {d24-d27}, [%2]! \n"
6281                         "vst1.f32   {d28-d31}, [%3]! \n"
6282 
6283                         : "=r"(output0_tm), // %0
6284                         "=r"(output1_tm), // %1
6285                         "=r"(output2_tm), // %2
6286                         "=r"(output3_tm), // %3
6287                         "=r"(bb2p0),      // %4
6288                         "=r"(ktm0)        // %5
6289                         : "0"(output0_tm),
6290                         "1"(output1_tm),
6291                         "2"(output2_tm),
6292                         "3"(output3_tm),
6293                         "4"(bb2p0),
6294                         "5"(ktm0),
6295                         "r"(inch) // %12
6296                         : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
6297 #endif // __aarch64__
6298 #else
6299                     float sum0_0 = 0.f;
6300                     float sum0_1 = 0.f;
6301                     float sum0_2 = 0.f;
6302                     float sum0_3 = 0.f;
6303                     float sum0_4 = 0.f;
6304                     float sum0_5 = 0.f;
6305                     float sum0_6 = 0.f;
6306                     float sum0_7 = 0.f;
6307 
6308                     float sum1_0 = 0.f;
6309                     float sum1_1 = 0.f;
6310                     float sum1_2 = 0.f;
6311                     float sum1_3 = 0.f;
6312                     float sum1_4 = 0.f;
6313                     float sum1_5 = 0.f;
6314                     float sum1_6 = 0.f;
6315                     float sum1_7 = 0.f;
6316 
6317                     float sum2_0 = 0.f;
6318                     float sum2_1 = 0.f;
6319                     float sum2_2 = 0.f;
6320                     float sum2_3 = 0.f;
6321                     float sum2_4 = 0.f;
6322                     float sum2_5 = 0.f;
6323                     float sum2_6 = 0.f;
6324                     float sum2_7 = 0.f;
6325 
6326                     float sum3_0 = 0.f;
6327                     float sum3_1 = 0.f;
6328                     float sum3_2 = 0.f;
6329                     float sum3_3 = 0.f;
6330                     float sum3_4 = 0.f;
6331                     float sum3_5 = 0.f;
6332                     float sum3_6 = 0.f;
6333                     float sum3_7 = 0.f;
6334 
6335                     for (int q = 0; q < inch; q++)
6336                     {
6337                         sum0_0 += bb2p0[0] * ktm0[0];
6338                         sum0_1 += bb2p0[1] * ktm0[0];
6339                         sum0_2 += bb2p0[2] * ktm0[0];
6340                         sum0_3 += bb2p0[3] * ktm0[0];
6341                         sum0_4 += bb2p0[4] * ktm0[0];
6342                         sum0_5 += bb2p0[5] * ktm0[0];
6343                         sum0_6 += bb2p0[6] * ktm0[0];
6344                         sum0_7 += bb2p0[7] * ktm0[0];
6345 
6346                         sum1_0 += bb2p0[0] * ktm0[1];
6347                         sum1_1 += bb2p0[1] * ktm0[1];
6348                         sum1_2 += bb2p0[2] * ktm0[1];
6349                         sum1_3 += bb2p0[3] * ktm0[1];
6350                         sum1_4 += bb2p0[4] * ktm0[1];
6351                         sum1_5 += bb2p0[5] * ktm0[1];
6352                         sum1_6 += bb2p0[6] * ktm0[1];
6353                         sum1_7 += bb2p0[7] * ktm0[1];
6354 
6355                         sum2_0 += bb2p0[0] * ktm0[2];
6356                         sum2_1 += bb2p0[1] * ktm0[2];
6357                         sum2_2 += bb2p0[2] * ktm0[2];
6358                         sum2_3 += bb2p0[3] * ktm0[2];
6359                         sum2_4 += bb2p0[4] * ktm0[2];
6360                         sum2_5 += bb2p0[5] * ktm0[2];
6361                         sum2_6 += bb2p0[6] * ktm0[2];
6362                         sum2_7 += bb2p0[7] * ktm0[2];
6363 
6364                         sum3_0 += bb2p0[0] * ktm0[3];
6365                         sum3_1 += bb2p0[1] * ktm0[3];
6366                         sum3_2 += bb2p0[2] * ktm0[3];
6367                         sum3_3 += bb2p0[3] * ktm0[3];
6368                         sum3_4 += bb2p0[4] * ktm0[3];
6369                         sum3_5 += bb2p0[5] * ktm0[3];
6370                         sum3_6 += bb2p0[6] * ktm0[3];
6371                         sum3_7 += bb2p0[7] * ktm0[3];
6372 
6373                         bb2p0 += 8;
6374                         ktm0 += 4;
6375                     }
6376 
6377                     output0_tm[0] = sum0_0;
6378                     output0_tm[1] = sum0_1;
6379                     output0_tm[2] = sum0_2;
6380                     output0_tm[3] = sum0_3;
6381                     output0_tm[4] = sum0_4;
6382                     output0_tm[5] = sum0_5;
6383                     output0_tm[6] = sum0_6;
6384                     output0_tm[7] = sum0_7;
6385 
6386                     output1_tm[0] = sum1_0;
6387                     output1_tm[1] = sum1_1;
6388                     output1_tm[2] = sum1_2;
6389                     output1_tm[3] = sum1_3;
6390                     output1_tm[4] = sum1_4;
6391                     output1_tm[5] = sum1_5;
6392                     output1_tm[6] = sum1_6;
6393                     output1_tm[7] = sum1_7;
6394 
6395                     output2_tm[0] = sum2_0;
6396                     output2_tm[1] = sum2_1;
6397                     output2_tm[2] = sum2_2;
6398                     output2_tm[3] = sum2_3;
6399                     output2_tm[4] = sum2_4;
6400                     output2_tm[5] = sum2_5;
6401                     output2_tm[6] = sum2_6;
6402                     output2_tm[7] = sum2_7;
6403 
6404                     output3_tm[0] = sum3_0;
6405                     output3_tm[1] = sum3_1;
6406                     output3_tm[2] = sum3_2;
6407                     output3_tm[3] = sum3_3;
6408                     output3_tm[4] = sum3_4;
6409                     output3_tm[5] = sum3_5;
6410                     output3_tm[6] = sum3_6;
6411                     output3_tm[7] = sum3_7;
6412 
6413                     output0_tm += 8;
6414                     output1_tm += 8;
6415                     output2_tm += 8;
6416                     output3_tm += 8;
6417 #endif // __ARM_NEON
6418                 }
6419                 for (; i + 3 < tiles; i += 4)
6420                 {
6421                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
6422 
6423                     const float* ktm0 = kernel_tm0.row(r);
6424 #if __ARM_NEON
6425 #if __aarch64__
6426                     asm volatile(
6427                         "eor    v8.16b, v8.16b, v8.16b     \n"
6428                         "eor    v9.16b, v9.16b, v9.16b     \n"
6429                         "eor    v10.16b, v10.16b, v10.16b  \n"
6430                         "eor    v11.16b, v11.16b, v11.16b  \n"
6431 
6432                         // inch loop
6433                         "lsr    w4, %w12, #2            \n" // w4 = nn = inch >> 2
6434                         "cmp    w4, #0                  \n"
6435                         "beq    1f                      \n"
6436 
6437                         "0:                             \n"
6438 
6439                         "prfm   pldl1keep, [%4, #512]   \n"
6440                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64     \n"
6441 
6442                         "prfm   pldl1keep, [%5, #512]   \n"
6443                         "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64     \n"
6444 
6445                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6446                         "fmla   v9.4s, v4.4s, v0.s[1]   \n"
6447                         "fmla   v10.4s, v4.4s, v0.s[2]  \n"
6448                         "fmla   v11.4s, v4.4s, v0.s[3]  \n"
6449 
6450                         "fmla   v8.4s, v5.4s, v1.s[0]   \n"
6451                         "fmla   v9.4s, v5.4s, v1.s[1]   \n"
6452                         "fmla   v10.4s, v5.4s, v1.s[2]  \n"
6453                         "fmla   v11.4s, v5.4s, v1.s[3]  \n"
6454 
6455                         "fmla   v8.4s, v6.4s, v2.s[0]   \n"
6456                         "fmla   v9.4s, v6.4s, v2.s[1]   \n"
6457                         "fmla   v10.4s, v6.4s, v2.s[2]  \n"
6458                         "fmla   v11.4s, v6.4s, v2.s[3]  \n"
6459 
6460                         "fmla   v8.4s, v7.4s, v3.s[0]   \n"
6461                         "fmla   v9.4s, v7.4s, v3.s[1]   \n"
6462                         "fmla   v10.4s, v7.4s, v3.s[2]  \n"
6463                         "fmla   v11.4s, v7.4s, v3.s[3]  \n"
6464 
6465                         "subs   w4, w4, #1              \n"
6466                         "bne    0b                      \n"
6467 
6468                         "1:                             \n"
6469 
6470                         // remain loop
6471                         "and    w4, %w12, #3            \n" // w4 = remain = tiles & 3;
6472                         "cmp    w4, #0                  \n"
6473                         "beq    3f                      \n"
6474 
6475                         "2:                             \n"
6476 
6477                         "prfm   pldl1keep, [%4, #128]   \n"
6478                         "ld1    {v4.4s}, [%4], #16      \n"
6479 
6480                         "prfm   pldl1keep, [%5, #128]   \n"
6481                         "ld1    {v0.4s}, [%5], #16      \n"
6482 
6483                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6484                         "fmla   v9.4s, v4.4s, v0.s[1]   \n"
6485                         "fmla   v10.4s, v4.4s, v0.s[2]  \n"
6486                         "fmla   v11.4s, v4.4s, v0.s[3]  \n"
6487 
6488                         "subs   w4, w4, #1              \n"
6489                         "bne    2b                      \n"
6490 
6491                         "3:                             \n"
6492 
6493                         "st1    {v8.4s}, [%0], #16      \n"
6494                         "st1    {v9.4s}, [%1], #16      \n"
6495                         "st1    {v10.4s}, [%2], #16     \n"
6496                         "st1    {v11.4s}, [%3], #16     \n"
6497 
6498                         : "=r"(output0_tm), // %0
6499                         "=r"(output1_tm), // %1
6500                         "=r"(output2_tm), // %2
6501                         "=r"(output3_tm), // %3
6502                         "=r"(bb2p0),      // %4
6503                         "=r"(ktm0)        // %5
6504                         : "0"(output0_tm),
6505                         "1"(output1_tm),
6506                         "2"(output2_tm),
6507                         "3"(output3_tm),
6508                         "4"(bb2p0),
6509                         "5"(ktm0),
6510                         "r"(inch) // %12
6511                         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
6512 #else  // __aarch64__
6513                     asm volatile(
6514                         "veor       q8, q8, q8      \n"
6515                         "veor       q9, q9, q9      \n"
6516                         "veor       q10, q10, q10   \n"
6517                         "veor       q11, q11, q11   \n"
6518 
6519                         // inch loop
6520                         "lsr        r4, %12, #2     \n" // r4 = nn = inch >> 2
6521                         "cmp        r4, #0          \n"
6522                         "beq        1f              \n"
6523 
6524                         "0:                         \n"
6525 
6526                         "pld        [%4, #512]      \n"
6527                         "vldm       %4!, {d8-d15}   \n"
6528                         //                         "vld1.f32   {d8-d11}, [%4 :128]! \n"
6529                         //                         "vld1.f32   {d12-d15}, [%4 :128]! \n"
6530 
6531                         "pld        [%5, #512]      \n"
6532                         "vldm       %5!, {d0-d7}    \n"
6533                         //                         "vld1.f32   {d0-d3}, [%5 :128]!  \n"
6534                         //                         "vld1.f32   {d4-d7}, [%5 :128]!  \n"
6535 
6536                         "vmla.f32   q8, q4, d0[0]   \n"
6537                         "vmla.f32   q9, q4, d0[1]   \n"
6538                         "vmla.f32   q10, q4, d1[0]  \n"
6539                         "vmla.f32   q11, q4, d1[1]  \n"
6540 
6541                         "vmla.f32   q8, q5, d2[0]   \n"
6542                         "vmla.f32   q9, q5, d2[1]   \n"
6543                         "vmla.f32   q10, q5, d3[0]  \n"
6544                         "vmla.f32   q11, q5, d3[1]  \n"
6545 
6546                         "subs       r4, r4, #1      \n"
6547 
6548                         "vmla.f32   q8, q6, d4[0]   \n"
6549                         "vmla.f32   q9, q6, d4[1]   \n"
6550                         "vmla.f32   q10, q6, d5[0]  \n"
6551                         "vmla.f32   q11, q6, d5[1]  \n"
6552 
6553                         "vmla.f32   q8, q7, d6[0]   \n"
6554                         "vmla.f32   q9, q7, d6[1]   \n"
6555                         "vmla.f32   q10, q7, d7[0]  \n"
6556                         "vmla.f32   q11, q7, d7[1]  \n"
6557 
6558                         "bne        0b              \n"
6559 
6560                         "1:                         \n"
6561 
6562                         // remain loop
6563                         "and        r4, %12, #3     \n" // r4 = remain = tiles & 3;
6564                         "cmp        r4, #0          \n"
6565                         "beq        3f              \n"
6566 
6567                         "2:                         \n"
6568 
6569                         "pld        [%4, #128]      \n"
6570                         "vld1.f32   {d8-d9}, [%4 :128]!  \n"
6571 
6572                         "pld        [%5, #128]      \n"
6573                         "vld1.f32   {d0-d1}, [%5 :128]!  \n"
6574 
6575                         "subs       r4, r4, #1      \n"
6576 
6577                         "vmla.f32   q8, q4, d0[0]   \n"
6578                         "vmla.f32   q9, q4, d0[1]   \n"
6579                         "vmla.f32   q10, q4, d1[0]  \n"
6580                         "vmla.f32   q11, q4, d1[1]  \n"
6581 
6582                         "bne        2b              \n"
6583 
6584                         "3:                         \n"
6585 
6586                         "vst1.f32   {d16-d17}, [%0]! \n"
6587                         "vst1.f32   {d18-d19}, [%1]! \n"
6588                         "vst1.f32   {d20-d21}, [%2]! \n"
6589                         "vst1.f32   {d22-d23}, [%3]! \n"
6590 
6591                         : "=r"(output0_tm), // %0
6592                         "=r"(output1_tm), // %1
6593                         "=r"(output2_tm), // %2
6594                         "=r"(output3_tm), // %3
6595                         "=r"(bb2p0),      // %4
6596                         "=r"(ktm0)        // %5
6597                         : "0"(output0_tm),
6598                         "1"(output1_tm),
6599                         "2"(output2_tm),
6600                         "3"(output3_tm),
6601                         "4"(bb2p0),
6602                         "5"(ktm0),
6603                         "r"(inch) // %12
6604                         : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
6605 #endif // __aarch64__
6606 #else
6607                     float sum0_0 = 0.f;
6608                     float sum0_1 = 0.f;
6609                     float sum0_2 = 0.f;
6610                     float sum0_3 = 0.f;
6611 
6612                     float sum1_0 = 0.f;
6613                     float sum1_1 = 0.f;
6614                     float sum1_2 = 0.f;
6615                     float sum1_3 = 0.f;
6616 
6617                     float sum2_0 = 0.f;
6618                     float sum2_1 = 0.f;
6619                     float sum2_2 = 0.f;
6620                     float sum2_3 = 0.f;
6621 
6622                     float sum3_0 = 0.f;
6623                     float sum3_1 = 0.f;
6624                     float sum3_2 = 0.f;
6625                     float sum3_3 = 0.f;
6626 
6627                     for (int q = 0; q < inch; q++)
6628                     {
6629                         sum0_0 += bb2p0[0] * ktm0[0];
6630                         sum0_1 += bb2p0[1] * ktm0[0];
6631                         sum0_2 += bb2p0[2] * ktm0[0];
6632                         sum0_3 += bb2p0[3] * ktm0[0];
6633 
6634                         sum1_0 += bb2p0[0] * ktm0[1];
6635                         sum1_1 += bb2p0[1] * ktm0[1];
6636                         sum1_2 += bb2p0[2] * ktm0[1];
6637                         sum1_3 += bb2p0[3] * ktm0[1];
6638 
6639                         sum2_0 += bb2p0[0] * ktm0[2];
6640                         sum2_1 += bb2p0[1] * ktm0[2];
6641                         sum2_2 += bb2p0[2] * ktm0[2];
6642                         sum2_3 += bb2p0[3] * ktm0[2];
6643 
6644                         sum3_0 += bb2p0[0] * ktm0[3];
6645                         sum3_1 += bb2p0[1] * ktm0[3];
6646                         sum3_2 += bb2p0[2] * ktm0[3];
6647                         sum3_3 += bb2p0[3] * ktm0[3];
6648 
6649                         bb2p0 += 4;
6650                         ktm0 += 4;
6651                     }
6652 
6653                     output0_tm[0] = sum0_0;
6654                     output0_tm[1] = sum0_1;
6655                     output0_tm[2] = sum0_2;
6656                     output0_tm[3] = sum0_3;
6657 
6658                     output1_tm[0] = sum1_0;
6659                     output1_tm[1] = sum1_1;
6660                     output1_tm[2] = sum1_2;
6661                     output1_tm[3] = sum1_3;
6662 
6663                     output2_tm[0] = sum2_0;
6664                     output2_tm[1] = sum2_1;
6665                     output2_tm[2] = sum2_2;
6666                     output2_tm[3] = sum2_3;
6667 
6668                     output3_tm[0] = sum3_0;
6669                     output3_tm[1] = sum3_1;
6670                     output3_tm[2] = sum3_2;
6671                     output3_tm[3] = sum3_3;
6672 
6673                     output0_tm += 4;
6674                     output1_tm += 4;
6675                     output2_tm += 4;
6676                     output3_tm += 4;
6677 #endif // __ARM_NEON
6678                 }
6679                 for (; i < tiles; i++)
6680                 {
6681                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
6682 
6683                     const float* ktm0 = kernel_tm0.row(r);
6684 
6685 #if __ARM_NEON
6686                     float32x4_t _sum0123 = vdupq_n_f32(0.f);
6687 
6688                     int q = 0;
6689                     for (; q + 3 < inch; q += 4)
6690                     {
6691                         //                         asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
6692                         float32x4_t _bb2p0 = vld1q_f32(bb2p0);
6693                         bb2p0 += 4;
6694 
6695                         //                         asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
6696                         float32x4_t _ktm0 = vld1q_f32(ktm0 + 0);
6697                         float32x4_t _ktm1 = vld1q_f32(ktm0 + 4);
6698                         float32x4_t _ktm2 = vld1q_f32(ktm0 + 8);
6699                         float32x4_t _ktm3 = vld1q_f32(ktm0 + 12);
6700                         ktm0 += 16;
6701 
6702 #if __aarch64__
6703                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm0, _bb2p0, 0);
6704                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm1, _bb2p0, 1);
6705                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm2, _bb2p0, 2);
6706                         _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm3, _bb2p0, 3);
6707 #else
6708                         _sum0123 = vmlaq_lane_f32(_sum0123, _ktm0, vget_low_f32(_bb2p0), 0);
6709                         _sum0123 = vmlaq_lane_f32(_sum0123, _ktm1, vget_low_f32(_bb2p0), 1);
6710                         _sum0123 = vmlaq_lane_f32(_sum0123, _ktm2, vget_high_f32(_bb2p0), 0);
6711                         _sum0123 = vmlaq_lane_f32(_sum0123, _ktm3, vget_high_f32(_bb2p0), 1);
6712 #endif // __aarch64__
6713                     }
6714 
6715                     for (; q < inch; q++)
6716                     {
6717                         float32x4_t _bb2p0 = vld1q_dup_f32(bb2p0);
6718                         float32x4_t _ktm0 = vld1q_f32(ktm0);
6719 
6720                         _sum0123 = vmlaq_f32(_sum0123, _bb2p0, _ktm0);
6721 
6722                         bb2p0 += 1;
6723                         ktm0 += 4;
6724                     }
6725 
6726                     float sum0 = vgetq_lane_f32(_sum0123, 0);
6727                     float sum1 = vgetq_lane_f32(_sum0123, 1);
6728                     float sum2 = vgetq_lane_f32(_sum0123, 2);
6729                     float sum3 = vgetq_lane_f32(_sum0123, 3);
6730 #else
6731                     float sum0 = 0.f;
6732                     float sum1 = 0.f;
6733                     float sum2 = 0.f;
6734                     float sum3 = 0.f;
6735 
6736                     for (int q = 0; q < inch; q++)
6737                     {
6738                         sum0 += bb2p0[0] * ktm0[0];
6739                         sum1 += bb2p0[0] * ktm0[1];
6740                         sum2 += bb2p0[0] * ktm0[2];
6741                         sum3 += bb2p0[0] * ktm0[3];
6742 
6743                         bb2p0 += 1;
6744                         ktm0 += 4;
6745                     }
6746 #endif // __ARM_NEON
6747 
6748                     output0_tm[0] = sum0;
6749                     output1_tm[0] = sum1;
6750                     output2_tm[0] = sum2;
6751                     output3_tm[0] = sum3;
6752 
6753                     output0_tm += 1;
6754                     output1_tm += 1;
6755                     output2_tm += 1;
6756                     output3_tm += 1;
6757                 }
6758             }
6759         }
6760 
6761         remain_outch_start += nn_outch << 2;
6762 
6763         #pragma omp parallel for num_threads(opt.num_threads)
6764         for (int p = remain_outch_start; p < outch; p++)
6765         {
6766 #if __ARM_NEON && __aarch64__
6767             const Mat kernel_tm0 = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4);
6768 #else
6769             const Mat kernel_tm0 = kernel_tm.channel(p / 4 + p % 4);
6770 #endif
6771 
6772             Mat out0_tm = top_blob_tm.channel(p);
6773 
6774             float* output0_tm = out0_tm;
6775 
6776             for (int r = 0; r < 64; r++)
6777             {
6778                 const Mat bb2 = bottom_blob_tm2.channel(r);
6779 
6780                 // tile
6781                 int i = 0;
6782                 for (; i + 7 < tiles; i += 8)
6783                 {
6784                     const float* bb2p0 = bb2.row(i / 8);
6785 
6786                     const float* ktm0 = kernel_tm0.row(r);
6787 #if __ARM_NEON
6788 #if __aarch64__
6789                     asm volatile(
6790                         "eor    v8.16b, v8.16b, v8.16b     \n"
6791                         "eor    v9.16b, v9.16b, v9.16b     \n"
6792 
6793                         // inch loop
6794                         "lsr    w4, %w6, #2             \n" // w4 = nn = inch >> 2
6795                         "cmp    w4, #0                  \n"
6796                         "beq    1f                      \n"
6797 
6798                         "0:                             \n"
6799 
6800                         "prfm   pldl1keep, [%1, #512]   \n"
6801                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64     \n"
6802 
6803                         "prfm   pldl1keep, [%2, #128]   \n"
6804                         "ld1    {v0.4s}, [%2], #16      \n"
6805 
6806                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6807                         "fmla   v9.4s, v5.4s, v0.s[0]   \n"
6808                         "fmla   v8.4s, v6.4s, v0.s[1]   \n"
6809                         "fmla   v9.4s, v7.4s, v0.s[1]   \n"
6810 
6811                         "prfm   pldl1keep, [%1, #512]   \n"
6812                         "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
6813 
6814                         "fmla   v8.4s, v12.4s, v0.s[2]  \n"
6815                         "fmla   v9.4s, v13.4s, v0.s[2]  \n"
6816                         "fmla   v8.4s, v14.4s, v0.s[3]  \n"
6817                         "fmla   v9.4s, v15.4s, v0.s[3]  \n"
6818 
6819                         "subs   w4, w4, #1              \n"
6820                         "bne    0b                      \n"
6821 
6822                         "1:                             \n"
6823 
6824                         // remain loop
6825                         "and    w4, %w6, #3             \n" // w4 = remain = tiles & 3;
6826                         "cmp    w4, #0                  \n"
6827                         "beq    3f                      \n"
6828 
6829                         "2:                             \n"
6830 
6831                         "prfm   pldl1keep, [%1, #256]   \n"
6832                         "ld1    {v4.4s, v5.4s}, [%1], #32      \n"
6833 
6834                         "prfm   pldl1keep, [%2, #32]    \n"
6835                         "ld1r   {v0.4s}, [%2], #4       \n"
6836 
6837                         "fmla   v8.4s, v4.4s, v0.4s     \n"
6838                         "fmla   v9.4s, v5.4s, v0.4s     \n"
6839 
6840                         "subs   w4, w4, #1              \n"
6841                         "bne    2b                      \n"
6842 
6843                         "3:                             \n"
6844 
6845                         "st1    {v8.4s, v9.4s}, [%0], #32       \n"
6846 
6847                         : "=r"(output0_tm), // %0
6848                         "=r"(bb2p0),      // %1
6849                         "=r"(ktm0)        // %2
6850                         : "0"(output0_tm),
6851                         "1"(bb2p0),
6852                         "2"(ktm0),
6853                         "r"(inch) // %6
6854                         : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v12", "v13", "v14", "v15");
6855 #else  // __aarch64__
6856                     asm volatile(
6857                         "veor       q8, q8, q8          \n"
6858                         "veor       q9, q9, q9          \n"
6859 
6860                         // inch loop
6861                         "lsr        r4, %6, #2          \n" // r4 = nn = inch >> 2
6862                         "cmp        r4, #0              \n"
6863                         "beq        1f                  \n"
6864 
6865                         "0:                             \n"
6866 
6867                         "pld        [%1, #512]          \n"
6868                         "vldm       %1!, {d8-d15}       \n"
6869                         //                         "vld1.f32   {d8-d11}, [%1 :128]! \n"
6870                         //                         "vld1.f32   {d12-d15}, [%1 :128]! \n"
6871 
6872                         "pld        [%2, #128]          \n"
6873                         "vld1.f32   {d0-d1}, [%2 :128]! \n"
6874 
6875                         "vmla.f32   q8, q4, d0[0]       \n"
6876                         "vmla.f32   q9, q5, d0[0]       \n"
6877                         "vmla.f32   q8, q6, d0[1]       \n"
6878                         "vmla.f32   q9, q7, d0[1]       \n"
6879 
6880                         "pld        [%1, #512]          \n"
6881                         "vldm       %1!, {d24-d31}      \n"
6882                         //                         "vld1.f32   {d24-d27}, [%1 :128]! \n"
6883                         //                         "vld1.f32   {d28-d31}, [%1 :128]! \n"
6884 
6885                         "subs       r4, r4, #1          \n"
6886 
6887                         "vmla.f32   q8, q12, d1[0]      \n"
6888                         "vmla.f32   q9, q13, d1[0]      \n"
6889                         "vmla.f32   q8, q14, d1[1]      \n"
6890                         "vmla.f32   q9, q15, d1[1]      \n"
6891 
6892                         "bne        0b                  \n"
6893 
6894                         "1:                             \n"
6895 
6896                         // remain loop
6897                         "and        r4, %6, #3          \n" // r4 = remain = tiles & 3;
6898                         "cmp        r4, #0              \n"
6899                         "beq        3f                  \n"
6900 
6901                         "2:                             \n"
6902 
6903                         "pld        [%1, #256]          \n"
6904                         "vld1.f32   {d8-d11}, [%1 :128]! \n"
6905 
6906                         "pld        [%2, #32]           \n"
6907                         "vld1.f32   {d0[],d1[]}, [%2]!  \n"
6908 
6909                         "subs       r4, r4, #1          \n"
6910 
6911                         "vmla.f32   q8, q4, q0          \n"
6912                         "vmla.f32   q9, q5, q0          \n"
6913 
6914                         "bne        2b                  \n"
6915 
6916                         "3:                             \n"
6917 
6918                         "vst1.f32   {d16-d19}, [%0]!    \n"
6919 
6920                         : "=r"(output0_tm), // %0
6921                         "=r"(bb2p0),      // %1
6922                         "=r"(ktm0)        // %2
6923                         : "0"(output0_tm),
6924                         "1"(bb2p0),
6925                         "2"(ktm0),
6926                         "r"(inch) // %6
6927                         : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q12", "q13", "q14", "q15");
6928 #endif // __aarch64__
6929 #else
6930                     float sum0 = 0.f;
6931                     float sum1 = 0.f;
6932                     float sum2 = 0.f;
6933                     float sum3 = 0.f;
6934                     float sum4 = 0.f;
6935                     float sum5 = 0.f;
6936                     float sum6 = 0.f;
6937                     float sum7 = 0.f;
6938 
6939                     for (int q = 0; q < inch; q++)
6940                     {
6941                         sum0 += bb2p0[0] * ktm0[0];
6942                         sum1 += bb2p0[1] * ktm0[0];
6943                         sum2 += bb2p0[2] * ktm0[0];
6944                         sum3 += bb2p0[3] * ktm0[0];
6945                         sum4 += bb2p0[4] * ktm0[0];
6946                         sum5 += bb2p0[5] * ktm0[0];
6947                         sum6 += bb2p0[6] * ktm0[0];
6948                         sum7 += bb2p0[7] * ktm0[0];
6949 
6950                         bb2p0 += 8;
6951                         ktm0 += 1;
6952                     }
6953 
6954                     output0_tm[0] = sum0;
6955                     output0_tm[1] = sum1;
6956                     output0_tm[2] = sum2;
6957                     output0_tm[3] = sum3;
6958                     output0_tm[4] = sum4;
6959                     output0_tm[5] = sum5;
6960                     output0_tm[6] = sum6;
6961                     output0_tm[7] = sum7;
6962 
6963                     output0_tm += 8;
6964 #endif // __ARM_NEON
6965                 }
6966                 for (; i + 3 < tiles; i += 4)
6967                 {
6968                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
6969 
6970                     const float* ktm0 = kernel_tm0.row(r);
6971 #if __ARM_NEON
6972 #if __aarch64__
6973                     asm volatile(
6974                         "eor    v8.16b, v8.16b, v8.16b     \n"
6975 
6976                         // inch loop
6977                         "lsr    w4, %w6, #2             \n" // w4 = nn = inch >> 2
6978                         "cmp    w4, #0                  \n"
6979                         "beq    1f                      \n"
6980 
6981                         "0:                             \n"
6982 
6983                         "prfm   pldl1keep, [%4, #512]   \n"
6984                         "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64     \n"
6985 
6986                         "prfm   pldl1keep, [%5, #128]   \n"
6987                         "ld1    {v0.4s}, [%5], #16      \n"
6988 
6989                         "fmla   v8.4s, v4.4s, v0.s[0]   \n"
6990                         "fmla   v8.4s, v5.4s, v0.s[1]   \n"
6991                         "fmla   v8.4s, v6.4s, v0.s[2]   \n"
6992                         "fmla   v8.4s, v7.4s, v0.s[3]   \n"
6993 
6994                         "subs   w4, w4, #1              \n"
6995                         "bne    0b                      \n"
6996 
6997                         "1:                             \n"
6998 
6999                         // remain loop
7000                         "and    w4, %w6, #3             \n" // w4 = remain = tiles & 3;
7001                         "cmp    w4, #0                  \n"
7002                         "beq    3f                      \n"
7003 
7004                         "2:                             \n"
7005 
7006                         "prfm   pldl1keep, [%4, #128]   \n"
7007                         "ld1    {v4.4s}, [%4], #16      \n"
7008 
7009                         "prfm   pldl1keep, [%5, #32]    \n"
7010                         "ld1r   {v0.4s}, [%5], #4       \n"
7011 
7012                         "fmla   v8.4s, v4.4s, v0.4s     \n"
7013 
7014                         "subs   w4, w4, #1              \n"
7015                         "bne    2b                      \n"
7016 
7017                         "3:                             \n"
7018 
7019                         "st1    {v8.4s}, [%0], #16      \n"
7020 
7021                         : "=r"(output0_tm), // %0
7022                         "=r"(bb2p0),      // %1
7023                         "=r"(ktm0)        // %2
7024                         : "0"(output0_tm),
7025                         "1"(bb2p0),
7026                         "2"(ktm0),
7027                         "r"(inch) // %6
7028                         : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8");
7029 #else  // __aarch64__
7030                     asm volatile(
7031                         "veor       q8, q8, q8          \n"
7032 
7033                         // inch loop
7034                         "lsr        r4, %6, #2          \n" // r4 = nn = inch >> 2
7035                         "cmp        r4, #0              \n"
7036                         "beq        1f                  \n"
7037 
7038                         "0:                             \n"
7039 
7040                         "pld        [%4, #512]          \n"
7041                         "vldm       %4!, {d8-d15}       \n"
7042                         //                         "vld1.f32   {d8-d11}, [%4 :128]! \n"
7043                         //                         "vld1.f32   {d12-d15}, [%4 :128]! \n"
7044 
7045                         "pld        [%5, #128]          \n"
7046                         "vld1.f32   {d0-d1}, [%5 :128]! \n"
7047 
7048                         "subs       r4, r4, #1          \n"
7049 
7050                         "vmla.f32   q8, q4, d0[0]       \n"
7051                         "vmla.f32   q8, q5, d0[1]       \n"
7052                         "vmla.f32   q8, q6, d1[0]       \n"
7053                         "vmla.f32   q8, q7, d1[1]       \n"
7054 
7055                         "bne        0b                  \n"
7056 
7057                         "1:                             \n"
7058 
7059                         // remain loop
7060                         "and        r4, %6, #3          \n" // r4 = remain = tiles & 3;
7061                         "cmp        r4, #0              \n"
7062                         "beq        3f                  \n"
7063 
7064                         "2:                             \n"
7065 
7066                         "pld        [%4, #128]          \n"
7067                         "vld1.f32   {d8-d9}, [%4]!      \n"
7068 
7069                         "pld        [%5, #32]           \n"
7070                         "vld1.f32   {d0[],d1[]}, [%5]!  \n"
7071 
7072                         "subs       r4, r4, #1          \n"
7073 
7074                         "vmla.f32   q8, q4, q0          \n"
7075 
7076                         "bne        2b                  \n"
7077 
7078                         "3:                             \n"
7079 
7080                         "vst1.f32   {d16-d17}, [%0]!    \n"
7081 
7082                         : "=r"(output0_tm), // %0
7083                         "=r"(bb2p0),      // %1
7084                         "=r"(ktm0)        // %2
7085                         : "0"(output0_tm),
7086                         "1"(bb2p0),
7087                         "2"(ktm0),
7088                         "r"(inch) // %6
7089                         : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q8");
7090 #endif // __aarch64__
7091 #else
7092                     float sum0 = 0.f;
7093                     float sum1 = 0.f;
7094                     float sum2 = 0.f;
7095                     float sum3 = 0.f;
7096 
7097                     for (int q = 0; q < inch; q++)
7098                     {
7099                         sum0 += bb2p0[0] * ktm0[0];
7100                         sum1 += bb2p0[1] * ktm0[0];
7101                         sum2 += bb2p0[2] * ktm0[0];
7102                         sum3 += bb2p0[3] * ktm0[0];
7103 
7104                         bb2p0 += 4;
7105                         ktm0 += 1;
7106                     }
7107 
7108                     output0_tm[0] = sum0;
7109                     output0_tm[1] = sum1;
7110                     output0_tm[2] = sum2;
7111                     output0_tm[3] = sum3;
7112 
7113                     output0_tm += 4;
7114 #endif // __ARM_NEON
7115                 }
7116                 for (; i < tiles; i++)
7117                 {
7118                     const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
7119 
7120                     const float* ktm0 = kernel_tm0.row(r);
7121 
7122                     int q = 0;
7123 #if __ARM_NEON
7124                     float32x4_t _sum0 = vdupq_n_f32(0.f);
7125                     for (; q + 3 < inch; q += 4)
7126                     {
7127                         //                         asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
7128                         float32x4_t _bb2p0 = vld1q_f32(bb2p0);
7129                         bb2p0 += 4;
7130 
7131                         float32x4_t _ktm0 = vld1q_f32(ktm0);
7132                         ktm0 += 4;
7133 
7134                         _sum0 = vmlaq_f32(_sum0, _bb2p0, _ktm0);
7135                     }
7136 
7137 #if __aarch64__
7138                     float sum0 = vaddvq_f32(_sum0);
7139 #else
7140                     float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
7141                     float sum0 = vget_lane_f32(vpadd_f32(_ss0, _ss0), 0);
7142 #endif // __aarch64__
7143 #else
7144                     float sum0 = 0.f;
7145 #endif
7146                     for (; q < inch; q++)
7147                     {
7148                         sum0 += bb2p0[0] * ktm0[0];
7149 
7150                         bb2p0 += 1;
7151                         ktm0 += 1;
7152                     }
7153 
7154                     output0_tm[0] = sum0;
7155 
7156                     output0_tm += 1;
7157                 }
7158             }
7159         }
7160     }
7161     bottom_blob_tm = Mat();
7162     // END dot
7163 
7164     // BEGIN transform output
7165     Mat top_blob_bordered;
7166     if (outw == top_blob.w && outh == top_blob.h)
7167     {
7168         top_blob_bordered = top_blob;
7169     }
7170     else
7171     {
7172         top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
7173     }
7174     {
7175         //         const float otm[6][8] = {
7176         //             {1.0f,  1.0f,   1.0f,   1.0f,   1.0f,  32.0f, 32.0f, 0.0f},
7177         //             {0.0f,  1.0f,  -1.0f,   2.0f,  -2.0f,  16.0f,-16.0f, 0.0f},
7178         //             {0.0f,  1.0f,   1.0f,   4.0f,   4.0f,   8.0f,  8.0f, 0.0f},
7179         //             {0.0f,  1.0f,  -1.0f,   8.0f,  -8.0f,   4.0f, -4.0f, 0.0f},
7180         //             {0.0f,  1.0f,   1.0f,  16.0f,  16.0f,   2.0f,  2.0f, 0.0f},
7181         //             {0.0f,  1.0f,  -1.0f,  32.0f, -32.0f,   1.0f, -1.0f, 1.0f}
7182         //         };
7183 
7184         // 0 = r0 + (r1 + r2) + (r3 + r4)     + (r5 + r6) * 32
7185         // 1 =      (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
7186         // 2 =      (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
7187         // 3 =      (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
7188         // 4 =      (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
7189         // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
7190 
7191 #if __ARM_NEON
7192         const float coeff[4] = {4.f, 8.f, 16.f, 32.f};
7193         float32x4_t _coeff = vld1q_f32(coeff);
7194 #endif // __ARM_NEON
7195 
7196         int w_tm = outw / 6 * 8;
7197         int h_tm = outh / 6 * 8;
7198         const int tiles = w_tm / 8 * h_tm / 8;
7199 
7200         #pragma omp parallel for num_threads(opt.num_threads)
7201         for (int p = 0; p < outch; p++)
7202         {
7203             const Mat out0_tm = top_blob_tm.channel(p);
7204             Mat out0 = top_blob_bordered.channel(p);
7205 
7206             const float bias0 = bias ? bias[p] : 0.f;
7207 #if __ARM_NEON
7208             float32x2_t _bias0 = vdup_n_f32(bias0);
7209 #endif // __ARM_NEON
7210 
7211             float tmp[6][8];
7212 
7213             // tile
7214             for (int i = 0; i < outh / 6; i++)
7215             {
7216                 for (int j = 0; j < outw / 6; j++)
7217                 {
7218 #if __ARM_NEON
7219 #if __aarch64__
7220                     const float* output0_tm0 = out0_tm.row(i * w_tm / 8 + j);
7221                     const float* output0_tm1 = out0_tm.row(i * w_tm / 8 + j + tiles * 8);
7222                     const float* output0_tm2 = out0_tm.row(i * w_tm / 8 + j + tiles * 16);
7223                     const float* output0_tm3 = out0_tm.row(i * w_tm / 8 + j + tiles * 24);
7224 
7225                     for (int m = 0; m + 3 < 8; m += 4)
7226                     {
7227                         float32x4_t _output0_tm_00 = {};
7228                         float32x4_t _output0_tm_11 = {};
7229                         float32x4_t _output0_tm_22 = {};
7230                         float32x4_t _output0_tm_33 = {};
7231                         float32x4_t _output0_tm_44 = {};
7232                         float32x4_t _output0_tm_55 = {};
7233                         float32x4_t _output0_tm_66 = {};
7234                         float32x4_t _output0_tm_77 = {};
7235 
7236                         _output0_tm_00 = vsetq_lane_f32(output0_tm0[0], _output0_tm_00, 0);
7237                         output0_tm0 += out0_tm.w * tiles;
7238                         _output0_tm_00 = vsetq_lane_f32(output0_tm1[0], _output0_tm_00, 1);
7239                         output0_tm1 += out0_tm.w * tiles;
7240                         _output0_tm_00 = vsetq_lane_f32(output0_tm2[0], _output0_tm_00, 2);
7241                         output0_tm2 += out0_tm.w * tiles;
7242                         _output0_tm_00 = vsetq_lane_f32(output0_tm3[0], _output0_tm_00, 3);
7243                         output0_tm3 += out0_tm.w * tiles;
7244 
7245                         _output0_tm_11 = vsetq_lane_f32(output0_tm0[0], _output0_tm_11, 0);
7246                         output0_tm0 += out0_tm.w * tiles;
7247                         _output0_tm_11 = vsetq_lane_f32(output0_tm1[0], _output0_tm_11, 1);
7248                         output0_tm1 += out0_tm.w * tiles;
7249                         _output0_tm_11 = vsetq_lane_f32(output0_tm2[0], _output0_tm_11, 2);
7250                         output0_tm2 += out0_tm.w * tiles;
7251                         _output0_tm_11 = vsetq_lane_f32(output0_tm3[0], _output0_tm_11, 3);
7252                         output0_tm3 += out0_tm.w * tiles;
7253 
7254                         _output0_tm_22 = vsetq_lane_f32(output0_tm0[0], _output0_tm_22, 0);
7255                         output0_tm0 += out0_tm.w * tiles;
7256                         _output0_tm_22 = vsetq_lane_f32(output0_tm1[0], _output0_tm_22, 1);
7257                         output0_tm1 += out0_tm.w * tiles;
7258                         _output0_tm_22 = vsetq_lane_f32(output0_tm2[0], _output0_tm_22, 2);
7259                         output0_tm2 += out0_tm.w * tiles;
7260                         _output0_tm_22 = vsetq_lane_f32(output0_tm3[0], _output0_tm_22, 3);
7261                         output0_tm3 += out0_tm.w * tiles;
7262 
7263                         _output0_tm_33 = vsetq_lane_f32(output0_tm0[0], _output0_tm_33, 0);
7264                         output0_tm0 += out0_tm.w * tiles;
7265                         _output0_tm_33 = vsetq_lane_f32(output0_tm1[0], _output0_tm_33, 1);
7266                         output0_tm1 += out0_tm.w * tiles;
7267                         _output0_tm_33 = vsetq_lane_f32(output0_tm2[0], _output0_tm_33, 2);
7268                         output0_tm2 += out0_tm.w * tiles;
7269                         _output0_tm_33 = vsetq_lane_f32(output0_tm3[0], _output0_tm_33, 3);
7270                         output0_tm3 += out0_tm.w * tiles;
7271 
7272                         _output0_tm_44 = vsetq_lane_f32(output0_tm0[0], _output0_tm_44, 0);
7273                         output0_tm0 += out0_tm.w * tiles;
7274                         _output0_tm_44 = vsetq_lane_f32(output0_tm1[0], _output0_tm_44, 1);
7275                         output0_tm1 += out0_tm.w * tiles;
7276                         _output0_tm_44 = vsetq_lane_f32(output0_tm2[0], _output0_tm_44, 2);
7277                         output0_tm2 += out0_tm.w * tiles;
7278                         _output0_tm_44 = vsetq_lane_f32(output0_tm3[0], _output0_tm_44, 3);
7279                         output0_tm3 += out0_tm.w * tiles;
7280 
7281                         _output0_tm_55 = vsetq_lane_f32(output0_tm0[0], _output0_tm_55, 0);
7282                         output0_tm0 += out0_tm.w * tiles;
7283                         _output0_tm_55 = vsetq_lane_f32(output0_tm1[0], _output0_tm_55, 1);
7284                         output0_tm1 += out0_tm.w * tiles;
7285                         _output0_tm_55 = vsetq_lane_f32(output0_tm2[0], _output0_tm_55, 2);
7286                         output0_tm2 += out0_tm.w * tiles;
7287                         _output0_tm_55 = vsetq_lane_f32(output0_tm3[0], _output0_tm_55, 3);
7288                         output0_tm3 += out0_tm.w * tiles;
7289 
7290                         _output0_tm_66 = vsetq_lane_f32(output0_tm0[0], _output0_tm_66, 0);
7291                         output0_tm0 += out0_tm.w * tiles;
7292                         _output0_tm_66 = vsetq_lane_f32(output0_tm1[0], _output0_tm_66, 1);
7293                         output0_tm1 += out0_tm.w * tiles;
7294                         _output0_tm_66 = vsetq_lane_f32(output0_tm2[0], _output0_tm_66, 2);
7295                         output0_tm2 += out0_tm.w * tiles;
7296                         _output0_tm_66 = vsetq_lane_f32(output0_tm3[0], _output0_tm_66, 3);
7297                         output0_tm3 += out0_tm.w * tiles;
7298 
7299                         _output0_tm_77 = vsetq_lane_f32(output0_tm0[0], _output0_tm_77, 0);
7300                         _output0_tm_77 = vsetq_lane_f32(output0_tm1[0], _output0_tm_77, 1);
7301                         _output0_tm_77 = vsetq_lane_f32(output0_tm2[0], _output0_tm_77, 2);
7302                         _output0_tm_77 = vsetq_lane_f32(output0_tm3[0], _output0_tm_77, 3);
7303 
7304                         float32x4_t _tmp024a = vaddq_f32(_output0_tm_11, _output0_tm_22);
7305                         float32x4_t _tmp135a = vsubq_f32(_output0_tm_11, _output0_tm_22);
7306 
7307                         float32x4_t _tmp024b = vaddq_f32(_output0_tm_33, _output0_tm_44);
7308                         float32x4_t _tmp135b = vsubq_f32(_output0_tm_33, _output0_tm_44);
7309 
7310                         float32x4_t _tmp024c = vaddq_f32(_output0_tm_55, _output0_tm_66);
7311                         float32x4_t _tmp135c = vsubq_f32(_output0_tm_55, _output0_tm_66);
7312 
7313                         float32x4_t _tmp0 = vaddq_f32(_output0_tm_00, _tmp024a);
7314                         _tmp0 = vmlaq_lane_f32(_tmp0, _tmp024c, vget_high_f32(_coeff), 1);
7315                         _tmp0 = vaddq_f32(_tmp0, _tmp024b);
7316 
7317                         float32x4_t _tmp2 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
7318                         _tmp2 = vmlaq_lane_f32(_tmp2, _tmp024c, vget_low_f32(_coeff), 1);
7319 
7320                         float32x4_t _tmp4 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
7321                         _tmp4 = vaddq_f32(_tmp4, _tmp024c);
7322                         _tmp4 = vaddq_f32(_tmp4, _tmp024c);
7323 
7324                         vst1q_f32(&tmp[0][m], _tmp0);
7325                         vst1q_f32(&tmp[2][m], _tmp2);
7326                         vst1q_f32(&tmp[4][m], _tmp4);
7327 
7328                         float32x4_t _tmp1 = vmlaq_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
7329                         _tmp1 = vaddq_f32(_tmp1, _tmp135b);
7330                         _tmp1 = vaddq_f32(_tmp1, _tmp135b);
7331 
7332                         float32x4_t _tmp3 = vmlaq_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
7333                         _tmp3 = vmlaq_lane_f32(_tmp3, _tmp135c, vget_low_f32(_coeff), 0);
7334 
7335                         float32x4_t _tmp5 = vaddq_f32(_output0_tm_77, _tmp135a);
7336                         _tmp5 = vmlaq_lane_f32(_tmp5, _tmp135b, vget_high_f32(_coeff), 1);
7337                         _tmp5 = vaddq_f32(_tmp5, _tmp135c);
7338 
7339                         vst1q_f32(&tmp[1][m], _tmp1);
7340                         vst1q_f32(&tmp[3][m], _tmp3);
7341                         vst1q_f32(&tmp[5][m], _tmp5);
7342 
7343                         output0_tm0 += out0_tm.w * tiles * 25;
7344                         output0_tm1 += out0_tm.w * tiles * 25;
7345                         output0_tm2 += out0_tm.w * tiles * 25;
7346                         output0_tm3 += out0_tm.w * tiles * 25;
7347                     }
7348 
7349                     const float* t0 = tmp[0];
7350                     const float* t1 = tmp[1];
7351 
7352                     float* output0 = out0.row(i * 6) + j * 6;
7353                     float* output1 = output0 + outw;
7354 
7355                     for (int m = 0; m + 1 < 6; m += 2)
7356                     {
7357                         float32x4_t _t0_0123 = vld1q_f32(t0);
7358                         float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
7359                         float32x4_t _t1_0123 = vld1q_f32(t1);
7360                         float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
7361 
7362                         float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
7363                         float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
7364 
7365                         float32x2_t _t_00 = vget_low_f32(_t01_00221133.val[0]);
7366                         float32x2_t _t_11 = vget_low_f32(_t01_00221133.val[1]);
7367                         float32x2_t _t_22 = vget_high_f32(_t01_00221133.val[0]);
7368                         float32x2_t _t_33 = vget_high_f32(_t01_00221133.val[1]);
7369                         float32x2_t _t_44 = vget_low_f32(_t01_44665577.val[0]);
7370                         float32x2_t _t_55 = vget_low_f32(_t01_44665577.val[1]);
7371                         float32x2_t _t_66 = vget_high_f32(_t01_44665577.val[0]);
7372                         float32x2_t _t_77 = vget_high_f32(_t01_44665577.val[1]);
7373 
7374                         float32x2_t _tmp024a = vadd_f32(_t_11, _t_22);
7375                         float32x2_t _tmp135a = vsub_f32(_t_11, _t_22);
7376 
7377                         float32x2_t _tmp024b = vadd_f32(_t_33, _t_44);
7378                         float32x2_t _tmp135b = vsub_f32(_t_33, _t_44);
7379 
7380                         float32x2_t _tmp024c = vadd_f32(_t_55, _t_66);
7381                         float32x2_t _tmp135c = vsub_f32(_t_55, _t_66);
7382 
7383                         float32x2_t _output_0 = vadd_f32(_t_00, _tmp024a);
7384                         _output_0 = vmla_lane_f32(_output_0, _tmp024c, vget_high_f32(_coeff), 1);
7385                         _output_0 = vadd_f32(_output_0, _tmp024b);
7386                         _output_0 = vadd_f32(_output_0, _bias0);
7387 
7388                         float32x2_t _output_2 = vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
7389                         _output_2 = vmla_lane_f32(_output_2, _tmp024c, vget_low_f32(_coeff), 1);
7390                         _output_2 = vadd_f32(_output_2, _bias0);
7391 
7392                         float32x2_t _output_4 = vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
7393                         _output_4 = vadd_f32(_output_4, _tmp024c);
7394                         _output_4 = vadd_f32(_output_4, _tmp024c);
7395                         _output_4 = vadd_f32(_output_4, _bias0);
7396 
7397                         output0[0] = vget_lane_f32(_output_0, 0);
7398                         output1[0] = vget_lane_f32(_output_0, 1);
7399                         output0[2] = vget_lane_f32(_output_2, 0);
7400                         output1[2] = vget_lane_f32(_output_2, 1);
7401                         output0[4] = vget_lane_f32(_output_4, 0);
7402                         output1[4] = vget_lane_f32(_output_4, 1);
7403 
7404                         float32x2_t _output_1 = vmla_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
7405                         _output_1 = vadd_f32(_output_1, _tmp135b);
7406                         _output_1 = vadd_f32(_output_1, _tmp135b);
7407                         _output_1 = vadd_f32(_output_1, _bias0);
7408 
7409                         float32x2_t _output_3 = vmla_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
7410                         _output_3 = vmla_lane_f32(_output_3, _tmp135c, vget_low_f32(_coeff), 0);
7411                         _output_3 = vadd_f32(_output_3, _bias0);
7412 
7413                         float32x2_t _output_5 = vadd_f32(_t_77, _tmp135a);
7414                         _output_5 = vmla_lane_f32(_output_5, _tmp135b, vget_high_f32(_coeff), 1);
7415                         _output_5 = vadd_f32(_output_5, _tmp135c);
7416                         _output_5 = vadd_f32(_output_5, _bias0);
7417 
7418                         output0[1] = vget_lane_f32(_output_1, 0);
7419                         output1[1] = vget_lane_f32(_output_1, 1);
7420                         output0[3] = vget_lane_f32(_output_3, 0);
7421                         output1[3] = vget_lane_f32(_output_3, 1);
7422                         output0[5] = vget_lane_f32(_output_5, 0);
7423                         output1[5] = vget_lane_f32(_output_5, 1);
7424 
7425                         t0 += 8 * 2;
7426                         t1 += 8 * 2;
7427                         output0 += outw * 2;
7428                         output1 += outw * 2;
7429                     }
7430 #else  // __aarch64__
7431                     const float* output0_tm0_0 = out0_tm.row(i * w_tm / 8 + j);
7432                     const float* output0_tm1_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 8);
7433                     const float* output0_tm2_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 16);
7434                     const float* output0_tm3_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 24);
7435                     const float* output0_tm0_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 32);
7436                     const float* output0_tm1_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 40);
7437                     const float* output0_tm2_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 48);
7438                     const float* output0_tm3_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 56);
7439 
7440                     float* t0 = tmp[0];
7441                     float* t1 = tmp[1];
7442 
7443                     //                     int step = out0_tm.w * tiles * 2*4 *4;
7444                     int step = out0_tm.w * tiles * 4;
7445 
7446                     asm volatile(
7447 
7448                         // loop0
7449                         //                         "vld1.f32   {d16-d17}, [%2], %21 \n"
7450                         //                         "vld1.f32   {d18-d19}, [%3], %21 \n"
7451                         //                         "vld1.f32   {d20-d21}, [%4], %21 \n"
7452                         //                         "vld1.f32   {d22-d23}, [%5], %21 \n"
7453                         //                         "vld1.f32   {d24-d25}, [%6], %21 \n"
7454                         //                         "vld1.f32   {d26-d27}, [%7], %21 \n"
7455                         //                         "vld1.f32   {d28-d29}, [%8], %21 \n"
7456                         //                         "vld1.f32   {d30-d31}, [%9], %21 \n"
7457 
7458                         //                         "vtrn.32    q8, q10             \n"
7459                         //                         "vtrn.32    q9, q11             \n"
7460                         //                         "vtrn.32    q12, q14            \n"
7461                         //                         "vtrn.32    q13, q15            \n"
7462 
7463                         //                         "vswp       d17, d24            \n"
7464                         //                         "vswp       d19, d26            \n"
7465                         //                         "vswp       d21, d28            \n"//  q8 = 00   q9 = 44  q10 = 11  q11 = 55
7466                         //                         "vswp       d23, d30            \n"// q12 = 22  q13 = 66  q14 = 33  q15 = 77
7467                         "vld1.f32   {d16[0]}, [%2], %21 \n"
7468                         "vld1.f32   {d16[1]}, [%3], %21 \n"
7469                         "vld1.f32   {d17[0]}, [%4], %21 \n"
7470                         "vld1.f32   {d17[1]}, [%5], %21 \n"
7471 
7472                         "vld1.f32   {d20[0]}, [%2], %21 \n"
7473                         "vld1.f32   {d20[1]}, [%3], %21 \n"
7474                         "vld1.f32   {d21[0]}, [%4], %21 \n"
7475                         "vld1.f32   {d21[1]}, [%5], %21 \n"
7476 
7477                         "vld1.f32   {d24[0]}, [%2], %21 \n"
7478                         "vld1.f32   {d24[1]}, [%3], %21 \n"
7479                         "vld1.f32   {d25[0]}, [%4], %21 \n"
7480                         "vld1.f32   {d25[1]}, [%5], %21 \n"
7481 
7482                         "vadd.f32   q2, q10, q12        \n"
7483                         "vsub.f32   q3, q10, q12        \n"
7484 
7485                         "vld1.f32   {d28[0]}, [%2], %21 \n"
7486                         "vld1.f32   {d28[1]}, [%3], %21 \n"
7487                         "vld1.f32   {d29[0]}, [%4], %21 \n"
7488                         "vld1.f32   {d29[1]}, [%5], %21 \n"
7489 
7490                         "vld1.f32   {d18[0]}, [%2], %21 \n"
7491                         "vld1.f32   {d18[1]}, [%3], %21 \n"
7492                         "vld1.f32   {d19[0]}, [%4], %21 \n"
7493                         "vld1.f32   {d19[1]}, [%5], %21 \n"
7494 
7495                         "vadd.f32   q4, q14, q9         \n"
7496                         "vsub.f32   q5, q14, q9         \n"
7497 
7498                         "vld1.f32   {d22[0]}, [%2], %21 \n"
7499                         "vld1.f32   {d22[1]}, [%3], %21 \n"
7500                         "vld1.f32   {d23[0]}, [%4], %21 \n"
7501                         "vld1.f32   {d23[1]}, [%5], %21 \n"
7502 
7503                         "vld1.f32   {d26[0]}, [%2], %21 \n"
7504                         "vld1.f32   {d26[1]}, [%3], %21 \n"
7505                         "vld1.f32   {d27[0]}, [%4], %21 \n"
7506                         "vld1.f32   {d27[1]}, [%5], %21 \n"
7507 
7508                         "vadd.f32   q6, q11, q13        \n"
7509                         "vsub.f32   q7, q11, q13        \n" // spare q9 q10 q11 q12 q13 q14
7510 
7511                         "vld1.f32   {d30[0]}, [%2]      \n"
7512                         "vld1.f32   {d30[1]}, [%3]      \n"
7513                         "vld1.f32   {d31[0]}, [%4]      \n"
7514                         "vld1.f32   {d31[1]}, [%5]      \n"
7515 
7516                         "vmov       q9, q3              \n"
7517                         "vadd.f32   q8, q8, q2          \n"
7518                         "vmla.f32   q9, q7, %f20[0]     \n"
7519                         "vmov       q12, q2             \n"
7520                         "vmov       q10, q2             \n"
7521                         "vmov       q11, q3             \n"
7522                         "vmla.f32   q12, q4, %f20[0]    \n"
7523                         "vadd.f32   q15, q15, q3        \n"
7524                         "vmla.f32   q8, q6, %f20[1]     \n"
7525                         "vadd.f32   q9, q9, q5          \n"
7526                         "vmla.f32   q10, q4, %e20[0]    \n"
7527                         "vmla.f32   q11, q5, %e20[1]    \n"
7528                         "vadd.f32   q12, q12, q6        \n"
7529                         "vmla.f32   q15, q5, %f20[1]    \n"
7530                         "vadd.f32   q8, q8, q4          \n"
7531                         "vadd.f32   q9, q9, q5          \n"
7532                         "vmla.f32   q10, q6, %e20[1]    \n"
7533                         "vmla.f32   q11, q7, %e20[0]    \n"
7534                         "vadd.f32   q12, q12, q6        \n"
7535                         "vadd.f32   q15, q15, q7        \n"
7536 
7537                         "vst1.f32   {d16-d17}, [%0]     \n"
7538                         "add        %0, %0, #64         \n"
7539 
7540                         "vst1.f32   {d18-d19}, [%1]     \n"
7541                         "add        %1, %1, #64         \n"
7542 
7543                         "vst1.f32   {d20-d21}, [%0]     \n"
7544                         "add        %0, %0, #64         \n"
7545 
7546                         "vst1.f32   {d22-d23}, [%1]     \n"
7547                         "add        %1, %1, #64         \n"
7548 
7549                         "vst1.f32   {d24-d25}, [%0]     \n"
7550                         "sub        %0, %0, #112        \n"
7551 
7552                         "vst1.f32   {d30-d31}, [%1]     \n"
7553                         "sub        %1, %1, #112        \n"
7554 
7555                         // loop1
7556                         //                         "vld1.f32   {d16-d17}, [%2]     \n"
7557                         //                         "vld1.f32   {d18-d19}, [%3]     \n"
7558                         //                         "vld1.f32   {d20-d21}, [%4]     \n"
7559                         //                         "vld1.f32   {d22-d23}, [%5]     \n"
7560                         //                         "vld1.f32   {d24-d25}, [%6]     \n"
7561                         //                         "vld1.f32   {d26-d27}, [%7]     \n"
7562                         //                         "vld1.f32   {d28-d29}, [%8]     \n"
7563                         //                         "vld1.f32   {d30-d31}, [%9]     \n"
7564 
7565                         //                         "vtrn.32    q8, q10             \n"
7566                         //                         "vtrn.32    q9, q11             \n"
7567                         //                         "vtrn.32    q12, q14            \n"
7568                         //                         "vtrn.32    q13, q15            \n"
7569 
7570                         //                         "vswp       d17, d24            \n"
7571                         //                         "vswp       d19, d26            \n"
7572                         //                         "vswp       d21, d28            \n"//  q8 = 00   q9 = 44  q10 = 11  q11 = 55
7573                         //                         "vswp       d23, d30            \n"// q12 = 22  q13 = 66  q14 = 33  q15 = 77
7574                         "vld1.f32   {d16[0]}, [%6], %21 \n"
7575                         "vld1.f32   {d16[1]}, [%7], %21 \n"
7576                         "vld1.f32   {d17[0]}, [%8], %21 \n"
7577                         "vld1.f32   {d17[1]}, [%9], %21 \n"
7578 
7579                         "vld1.f32   {d20[0]}, [%6], %21 \n"
7580                         "vld1.f32   {d20[1]}, [%7], %21 \n"
7581                         "vld1.f32   {d21[0]}, [%8], %21 \n"
7582                         "vld1.f32   {d21[1]}, [%9], %21 \n"
7583 
7584                         "vld1.f32   {d24[0]}, [%6], %21 \n"
7585                         "vld1.f32   {d24[1]}, [%7], %21 \n"
7586                         "vld1.f32   {d25[0]}, [%8], %21 \n"
7587                         "vld1.f32   {d25[1]}, [%9], %21 \n"
7588 
7589                         "vadd.f32   q2, q10, q12        \n"
7590                         "vsub.f32   q3, q10, q12        \n"
7591 
7592                         "vld1.f32   {d28[0]}, [%6], %21 \n"
7593                         "vld1.f32   {d28[1]}, [%7], %21 \n"
7594                         "vld1.f32   {d29[0]}, [%8], %21 \n"
7595                         "vld1.f32   {d29[1]}, [%9], %21 \n"
7596 
7597                         "vld1.f32   {d18[0]}, [%6], %21 \n"
7598                         "vld1.f32   {d18[1]}, [%7], %21 \n"
7599                         "vld1.f32   {d19[0]}, [%8], %21 \n"
7600                         "vld1.f32   {d19[1]}, [%9], %21 \n"
7601 
7602                         "vadd.f32   q4, q14, q9         \n"
7603                         "vsub.f32   q5, q14, q9         \n"
7604 
7605                         "vld1.f32   {d22[0]}, [%6], %21 \n"
7606                         "vld1.f32   {d22[1]}, [%7], %21 \n"
7607                         "vld1.f32   {d23[0]}, [%8], %21 \n"
7608                         "vld1.f32   {d23[1]}, [%9], %21 \n"
7609 
7610                         "vld1.f32   {d26[0]}, [%6], %21 \n"
7611                         "vld1.f32   {d26[1]}, [%7], %21 \n"
7612                         "vld1.f32   {d27[0]}, [%8], %21 \n"
7613                         "vld1.f32   {d27[1]}, [%9], %21 \n"
7614 
7615                         "vadd.f32   q6, q11, q13        \n"
7616                         "vsub.f32   q7, q11, q13        \n" // spare q9 q10 q11 q12 q13 q14
7617 
7618                         "vld1.f32   {d30[0]}, [%6]      \n"
7619                         "vld1.f32   {d30[1]}, [%7]      \n"
7620                         "vld1.f32   {d31[0]}, [%8]      \n"
7621                         "vld1.f32   {d31[1]}, [%9]      \n"
7622 
7623                         "vmov       q9, q3              \n"
7624                         "vadd.f32   q8, q8, q2          \n"
7625                         "vmla.f32   q9, q7, %f20[0]     \n"
7626                         "vmov       q12, q2             \n"
7627                         "vmov       q10, q2             \n"
7628                         "vmov       q11, q3             \n"
7629                         "vmla.f32   q12, q4, %f20[0]    \n"
7630                         "vadd.f32   q15, q15, q3        \n"
7631                         "vmla.f32   q8, q6, %f20[1]     \n"
7632                         "vadd.f32   q9, q9, q5          \n"
7633                         "vmla.f32   q10, q4, %e20[0]    \n"
7634                         "vmla.f32   q11, q5, %e20[1]    \n"
7635                         "vadd.f32   q12, q12, q6        \n"
7636                         "vmla.f32   q15, q5, %f20[1]    \n"
7637                         "vadd.f32   q8, q8, q4          \n"
7638                         "vadd.f32   q9, q9, q5          \n"
7639                         "vmla.f32   q10, q6, %e20[1]    \n"
7640                         "vmla.f32   q11, q7, %e20[0]    \n"
7641                         "vadd.f32   q12, q12, q6        \n"
7642                         "vadd.f32   q15, q15, q7        \n"
7643 
7644                         "vst1.f32   {d16-d17}, [%0]     \n"
7645                         "add        %0, %0, #64         \n"
7646 
7647                         "vst1.f32   {d18-d19}, [%1]     \n"
7648                         "add        %1, %1, #64         \n"
7649 
7650                         "vst1.f32   {d20-d21}, [%0]     \n"
7651                         "add        %0, %0, #64         \n"
7652 
7653                         "vst1.f32   {d22-d23}, [%1]     \n"
7654                         "add        %1, %1, #64         \n"
7655 
7656                         "vst1.f32   {d24-d25}, [%0]     \n"
7657 
7658                         "vst1.f32   {d30-d31}, [%1]     \n"
7659 
7660                         : "=r"(t0),            // %0
7661                         "=r"(t1),            // %1
7662                         "=r"(output0_tm0_0), // %2
7663                         "=r"(output0_tm1_0), // %3
7664                         "=r"(output0_tm2_0), // %4
7665                         "=r"(output0_tm3_0), // %5
7666                         "=r"(output0_tm0_4), // %6
7667                         "=r"(output0_tm1_4), // %7
7668                         "=r"(output0_tm2_4), // %8
7669                         "=r"(output0_tm3_4)  // %9
7670                         : "0"(t0),
7671                         "1"(t1),
7672                         "2"(output0_tm0_0),
7673                         "3"(output0_tm1_0),
7674                         "4"(output0_tm2_0),
7675                         "5"(output0_tm3_0),
7676                         "6"(output0_tm0_4),
7677                         "7"(output0_tm1_4),
7678                         "8"(output0_tm2_4),
7679                         "9"(output0_tm3_4),
7680                         "w"(_coeff), // %20
7681                         "r"(step)    // %21
7682                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
7683 
7684                     t0 = tmp[0];
7685                     t1 = tmp[1];
7686 
7687                     float* output0 = out0.row(i * 6) + j * 6;
7688                     float* output1 = output0 + outw;
7689 
7690                     int stepw = outw * 2 * 4;
7691 
7692                     asm volatile(
7693 
7694                         // loop0
7695                         "vld1.f32   {d16-d19}, [%2]     \n"
7696                         "vld1.f32   {d20-d23}, [%3]     \n"
7697 
7698                         "add        %2, %2, #64         \n"
7699                         "add        %3, %3, #64         \n"
7700 
7701                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
7702                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
7703 
7704                         "vadd.f32   d4, d20, d17        \n"
7705                         "vsub.f32   d5, d20, d17        \n"
7706 
7707                         "vadd.f32   d6, d21, d18        \n"
7708                         "vsub.f32   d7, d21, d18        \n"
7709 
7710                         "vadd.f32   d8, d22, d19        \n"
7711                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
7712 
7713                         "vmov       d20, d5             \n"
7714                         "vmov       d18, d4             \n"
7715 
7716                         "vadd.f32   d16, d16, d4        \n"
7717                         "vmla.f32   d20, d9, %f8[0]     \n"
7718                         "vmov       d17, d4             \n"
7719                         "vmov       d21, d5             \n"
7720                         "vmla.f32   d18, d6, %f8[0]     \n"
7721                         "vadd.f32   d22, d23, d5        \n"
7722 
7723                         "vmla.f32   d16, d8, %f8[1]     \n"
7724                         "vadd.f32   d20, d20, d7        \n"
7725                         "vmla.f32   d17, d6, %e8[0]     \n"
7726                         "vmla.f32   d21, d7, %e8[1]     \n"
7727                         "vadd.f32   d18, d18, d8        \n"
7728                         "vmla.f32   d22, d7, %f8[1]     \n"
7729 
7730                         "vadd.f32   d16, d16, d6        \n"
7731                         "vadd.f32   d20, d20, d7        \n"
7732                         "vmla.f32   d17, d8, %e8[1]     \n"
7733                         "vmla.f32   d21, d9, %e8[0]     \n"
7734                         "vadd.f32   d18, d18, d8        \n"
7735                         "vadd.f32   d22, d22, d9        \n"
7736 
7737                         "vadd.f32   d16, d16, %P9       \n" // _bias0
7738                         "vadd.f32   d20, d20, %P9       \n" // _bias0
7739                         "vadd.f32   d17, d17, %P9       \n" // _bias0
7740                         "vadd.f32   d21, d21, %P9       \n" // _bias0
7741                         "vadd.f32   d18, d18, %P9       \n" // _bias0
7742                         "vadd.f32   d22, d22, %P9       \n" // _bias0
7743 
7744                         "vtrn.f32   q8, q10             \n"
7745                         "vtrn.f32   d18, d22            \n"
7746 
7747                         "vst1.f32   {d16-d18}, [%0], %10 \n"
7748                         "vst1.f32   {d20-d22}, [%1], %10 \n"
7749 
7750                         // loop1
7751                         "vld1.f32   {d16-d19}, [%2]     \n"
7752                         "vld1.f32   {d20-d23}, [%3]     \n"
7753 
7754                         "add        %2, %2, #64         \n"
7755                         "add        %3, %3, #64         \n"
7756 
7757                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
7758                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
7759 
7760                         "vadd.f32   d4, d20, d17        \n"
7761                         "vsub.f32   d5, d20, d17        \n"
7762 
7763                         "vadd.f32   d6, d21, d18        \n"
7764                         "vsub.f32   d7, d21, d18        \n"
7765 
7766                         "vadd.f32   d8, d22, d19        \n"
7767                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
7768 
7769                         "vmov       d20, d5             \n"
7770                         "vmov       d18, d4             \n"
7771 
7772                         "vadd.f32   d16, d16, d4        \n"
7773                         "vmla.f32   d20, d9, %f8[0]     \n"
7774                         "vmov       d17, d4             \n"
7775                         "vmov       d21, d5             \n"
7776                         "vmla.f32   d18, d6, %f8[0]     \n"
7777                         "vadd.f32   d22, d23, d5        \n"
7778 
7779                         "vmla.f32   d16, d8, %f8[1]     \n"
7780                         "vadd.f32   d20, d20, d7        \n"
7781                         "vmla.f32   d17, d6, %e8[0]     \n"
7782                         "vmla.f32   d21, d7, %e8[1]     \n"
7783                         "vadd.f32   d18, d18, d8        \n"
7784                         "vmla.f32   d22, d7, %f8[1]     \n"
7785 
7786                         "vadd.f32   d16, d16, d6        \n"
7787                         "vadd.f32   d20, d20, d7        \n"
7788                         "vmla.f32   d17, d8, %e8[1]     \n"
7789                         "vmla.f32   d21, d9, %e8[0]     \n"
7790                         "vadd.f32   d18, d18, d8        \n"
7791                         "vadd.f32   d22, d22, d9        \n"
7792 
7793                         "vadd.f32   d16, d16, %P9       \n" // _bias0
7794                         "vadd.f32   d20, d20, %P9       \n" // _bias0
7795                         "vadd.f32   d17, d17, %P9       \n" // _bias0
7796                         "vadd.f32   d21, d21, %P9       \n" // _bias0
7797                         "vadd.f32   d18, d18, %P9       \n" // _bias0
7798                         "vadd.f32   d22, d22, %P9       \n" // _bias0
7799 
7800                         "vtrn.f32   q8, q10             \n"
7801                         "vtrn.f32   d18, d22            \n"
7802 
7803                         "vst1.f32   {d16-d18}, [%0], %10 \n"
7804                         "vst1.f32   {d20-d22}, [%1], %10 \n"
7805 
7806                         // loop2
7807                         "vld1.f32   {d16-d19}, [%2]     \n"
7808                         "vld1.f32   {d20-d23}, [%3]     \n"
7809 
7810                         "add        %2, %2, #64         \n"
7811                         "add        %3, %3, #64         \n"
7812 
7813                         "vtrn.32    q8, q10             \n" // q8 = 0 2  q10 = 1 3
7814                         "vtrn.32    q9, q11             \n" // q9 = 4 6  q11 = 5 7
7815 
7816                         "vadd.f32   d4, d20, d17        \n"
7817                         "vsub.f32   d5, d20, d17        \n"
7818 
7819                         "vadd.f32   d6, d21, d18        \n"
7820                         "vsub.f32   d7, d21, d18        \n"
7821 
7822                         "vadd.f32   d8, d22, d19        \n"
7823                         "vsub.f32   d9, d22, d19        \n" // spare d17 ~ d22
7824 
7825                         "vmov       d20, d5             \n"
7826                         "vmov       d18, d4             \n"
7827 
7828                         "vadd.f32   d16, d16, d4        \n"
7829                         "vmla.f32   d20, d9, %f8[0]     \n"
7830                         "vmov       d17, d4             \n"
7831                         "vmov       d21, d5             \n"
7832                         "vmla.f32   d18, d6, %f8[0]     \n"
7833                         "vadd.f32   d22, d23, d5        \n"
7834 
7835                         "vmla.f32   d16, d8, %f8[1]     \n"
7836                         "vadd.f32   d20, d20, d7        \n"
7837                         "vmla.f32   d17, d6, %e8[0]     \n"
7838                         "vmla.f32   d21, d7, %e8[1]     \n"
7839                         "vadd.f32   d18, d18, d8        \n"
7840                         "vmla.f32   d22, d7, %f8[1]     \n"
7841 
7842                         "vadd.f32   d16, d16, d6        \n"
7843                         "vadd.f32   d20, d20, d7        \n"
7844                         "vmla.f32   d17, d8, %e8[1]     \n"
7845                         "vmla.f32   d21, d9, %e8[0]     \n"
7846                         "vadd.f32   d18, d18, d8        \n"
7847                         "vadd.f32   d22, d22, d9        \n"
7848 
7849                         "vadd.f32   d16, d16, %P9       \n" // _bias0
7850                         "vadd.f32   d20, d20, %P9       \n" // _bias0
7851                         "vadd.f32   d17, d17, %P9       \n" // _bias0
7852                         "vadd.f32   d21, d21, %P9       \n" // _bias0
7853                         "vadd.f32   d18, d18, %P9       \n" // _bias0
7854                         "vadd.f32   d22, d22, %P9       \n" // _bias0
7855 
7856                         "vtrn.f32   q8, q10             \n"
7857                         "vtrn.f32   d18, d22            \n"
7858 
7859                         "vst1.f32   {d16-d18}, [%0], %10 \n"
7860                         "vst1.f32   {d20-d22}, [%1], %10 \n"
7861 
7862                         : "=r"(output0), // %0
7863                         "=r"(output1), // %1
7864                         "=r"(t0),      // %2
7865                         "=r"(t1)       // %3
7866                         : "0"(output0),
7867                         "1"(output1),
7868                         "2"(t0),
7869                         "3"(t1),
7870                         "w"(_coeff), // %8
7871                         "w"(_bias0), // %9
7872                         "r"(stepw)   // %10
7873                         : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
7874 #endif // __aarch64__
7875 #else
7876                     const float* output0_tm_0 = out0_tm.row(i * w_tm / 8 + j);
7877                     const float* output0_tm_1 = out0_tm.row(i * w_tm / 8 + j + tiles);
7878                     const float* output0_tm_2 = out0_tm.row(i * w_tm / 8 + j + tiles * 2);
7879                     const float* output0_tm_3 = out0_tm.row(i * w_tm / 8 + j + tiles * 3);
7880                     const float* output0_tm_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 4);
7881                     const float* output0_tm_5 = out0_tm.row(i * w_tm / 8 + j + tiles * 5);
7882                     const float* output0_tm_6 = out0_tm.row(i * w_tm / 8 + j + tiles * 6);
7883                     const float* output0_tm_7 = out0_tm.row(i * w_tm / 8 + j + tiles * 7);
7884 
7885                     for (int m = 0; m < 8; m++)
7886                     {
7887                         float tmp024a = output0_tm_1[0] + output0_tm_2[0];
7888                         float tmp135a = output0_tm_1[0] - output0_tm_2[0];
7889 
7890                         float tmp024b = output0_tm_3[0] + output0_tm_4[0];
7891                         float tmp135b = output0_tm_3[0] - output0_tm_4[0];
7892 
7893                         float tmp024c = output0_tm_5[0] + output0_tm_6[0];
7894                         float tmp135c = output0_tm_5[0] - output0_tm_6[0];
7895 
7896                         tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
7897                         tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
7898                         tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
7899 
7900                         tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
7901                         tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
7902                         tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
7903 
7904                         output0_tm_0 += out0_tm.w * tiles * 8;
7905                         output0_tm_1 += out0_tm.w * tiles * 8;
7906                         output0_tm_2 += out0_tm.w * tiles * 8;
7907                         output0_tm_3 += out0_tm.w * tiles * 8;
7908                         output0_tm_4 += out0_tm.w * tiles * 8;
7909                         output0_tm_5 += out0_tm.w * tiles * 8;
7910                         output0_tm_6 += out0_tm.w * tiles * 8;
7911                         output0_tm_7 += out0_tm.w * tiles * 8;
7912                     }
7913 
7914                     float* output0 = out0.row(i * 6) + j * 6;
7915 
7916                     for (int m = 0; m < 6; m++)
7917                     {
7918                         const float* tmp0 = tmp[m];
7919 
7920                         float tmp024a = tmp0[1] + tmp0[2];
7921                         float tmp135a = tmp0[1] - tmp0[2];
7922 
7923                         float tmp024b = tmp0[3] + tmp0[4];
7924                         float tmp135b = tmp0[3] - tmp0[4];
7925 
7926                         float tmp024c = tmp0[5] + tmp0[6];
7927                         float tmp135c = tmp0[5] - tmp0[6];
7928 
7929                         output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
7930                         output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
7931                         output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
7932 
7933                         output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
7934                         output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
7935                         output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
7936 
7937                         output0 += outw;
7938                     }
7939 #endif // __ARM_NEON
7940                 }
7941             }
7942         }
7943     }
7944     // END transform output
7945 
7946     // cut result pad
7947     if (top_blob_bordered.w != top_blob.w || top_blob_bordered.h != top_blob.h)
7948         copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
7949 }
7950 
conv3x3s2_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)7951 static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
7952 {
7953     int w = bottom_blob.w;
7954     int inch = bottom_blob.c;
7955 
7956     int outw = top_blob.w;
7957     int outh = top_blob.h;
7958     int outch = top_blob.c;
7959 
7960     const int tailstep = w - 2 * outw + w;
7961 
7962     const float* kernel = _kernel;
7963     const float* bias = _bias;
7964 
7965     int nn_outch = outch >> 1;
7966     int remain_outch_start = nn_outch << 1;
7967 
7968     #pragma omp parallel for num_threads(opt.num_threads)
7969     for (int pp = 0; pp < nn_outch; pp++)
7970     {
7971         int p = pp * 2;
7972 
7973         Mat out0 = top_blob.channel(p);
7974         Mat out1 = top_blob.channel(p + 1);
7975 
7976         const float bias0 = bias ? bias[p] : 0.f;
7977         const float bias1 = bias ? bias[p + 1] : 0.f;
7978 
7979         out0.fill(bias0);
7980         out1.fill(bias1);
7981 
7982         const float* k0 = kernel + p * inch * 9;
7983         const float* k1 = kernel + (p + 1) * inch * 9;
7984 
7985         for (int q = 0; q < inch; q++)
7986         {
7987             float* outptr0 = out0;
7988             float* outptr1 = out1;
7989 
7990             const float* img0 = bottom_blob.channel(q);
7991 
7992             const float* r0 = img0;
7993             const float* r1 = img0 + w;
7994             const float* r2 = img0 + w * 2;
7995 
7996 #if __ARM_NEON
7997             float32x4_t _k00 = vld1q_f32(k0);
7998             float32x4_t _k03 = vld1q_f32(k0 + 3);
7999             float32x4_t _k06 = vld1q_f32(k0 + 6);
8000 
8001             float32x4_t _k10 = vld1q_f32(k1);
8002             float32x4_t _k13 = vld1q_f32(k1 + 3);
8003             float32x4_t _k16 = vld1q_f32(k1 + 6);
8004 #endif // __ARM_NEON
8005 
8006             int i = 0;
8007 
8008             for (; i < outh; i++)
8009             {
8010 #if __ARM_NEON
8011                 int nn = outw >> 2;
8012                 int remain = outw & 3;
8013 #else
8014                 int remain = outw;
8015 #endif // __ARM_NEON
8016 
8017 #if __ARM_NEON
8018 #if __aarch64__
8019                 if (nn > 0)
8020                 {
8021                     asm volatile(
8022                         "prfm   pldl1keep, [%3, #256]       \n"
8023                         "ld2    {v8.4s, v9.4s}, [%3], #32   \n" // v8 v9 = r0
8024 
8025                         "0:                                 \n"
8026 
8027                         "prfm   pldl1keep, [%1, #128]       \n"
8028                         "ld1    {v6.4s}, [%1]               \n" // v6 = _sum0
8029 
8030                         "fmul   v12.4s, v8.4s, %12.s[0]     \n"
8031 
8032                         "prfm   pldl1keep, [%2, #128]       \n"
8033                         "ld1    {v7.4s}, [%2]               \n" // v7 = _sum1
8034 
8035                         "fmul   v13.4s, v8.4s, %15.s[0]     \n"
8036 
8037                         "prfm   pldl1keep, [%3, #128]       \n"
8038                         "ld2    {v10.4s, v11.4s}, [%3]      \n" // v10
8039 
8040                         "fmla   v6.4s, v9.4s, %12.s[1]      \n"
8041 
8042                         "ext    v14.16b, v8.16b, v10.16b, #4\n"
8043 
8044                         "fmla   v7.4s, v9.4s, %15.s[1]      \n"
8045 
8046                         "prfm   pldl1keep, [%4, #256]       \n"
8047                         "ld2    {v8.4s, v9.4s}, [%4], #32   \n" // r1
8048 
8049                         "fmla   v12.4s, v14.4s, %12.s[2]    \n"
8050                         "fmla   v13.4s, v14.4s, %15.s[2]    \n"
8051 
8052                         "prfm   pldl1keep, [%4, #128]       \n"
8053                         "ld2    {v10.4s, v11.4s}, [%4]      \n"
8054 
8055                         "fmla   v6.4s, v8.4s, %13.s[0]      \n"
8056                         "fmla   v7.4s, v8.4s, %16.s[0]      \n"
8057 
8058                         "ext    v14.16b, v8.16b, v10.16b, #4\n"
8059 
8060                         "fmla   v12.4s, v9.4s, %13.s[1]     \n"
8061                         "fmla   v13.4s, v9.4s, %16.s[1]     \n"
8062 
8063                         "prfm   pldl1keep, [%5, #256]       \n"
8064                         "ld2    {v8.4s, v9.4s}, [%5], #32   \n" // r2
8065 
8066                         "fmla   v6.4s, v14.4s, %13.s[2]     \n"
8067                         "fmla   v7.4s, v14.4s, %16.s[2]     \n"
8068 
8069                         "prfm   pldl1keep, [%5, #128]       \n"
8070                         "ld2    {v10.4s, v11.4s}, [%5]      \n"
8071 
8072                         "fmla   v12.4s, v8.4s, %14.s[0]     \n"
8073                         "fmla   v13.4s, v8.4s, %17.s[0]     \n"
8074 
8075                         "ext    v14.16b, v8.16b, v10.16b, #4\n"
8076 
8077                         "fmla   v6.4s, v9.4s, %14.s[1]      \n"
8078                         "fmla   v7.4s, v9.4s, %17.s[1]      \n"
8079 
8080                         "fmla   v12.4s, v14.4s, %14.s[2]    \n"
8081                         "fmla   v13.4s, v14.4s, %17.s[2]    \n"
8082 
8083                         "prfm   pldl1keep, [%3, #256]       \n"
8084                         "ld2    {v8.4s, v9.4s}, [%3], #32   \n" // v8 v9 = r0
8085 
8086                         "fadd   v6.4s, v6.4s, v12.4s        \n"
8087                         "fadd   v7.4s, v7.4s, v13.4s        \n"
8088 
8089                         "subs   %w0, %w0, #1                \n"
8090 
8091                         "st1    {v6.4s}, [%1], #16          \n"
8092                         "st1    {v7.4s}, [%2], #16          \n"
8093 
8094                         "bne    0b                          \n"
8095                         "sub    %3, %3, #32                 \n"
8096 
8097                         : "=r"(nn),      // %0
8098                         "=r"(outptr0), // %1
8099                         "=r"(outptr1), // %2
8100                         "=r"(r0),      // %3
8101                         "=r"(r1),      // %4
8102                         "=r"(r2)       // %5
8103                         : "0"(nn),
8104                         "1"(outptr0),
8105                         "2"(outptr1),
8106                         "3"(r0),
8107                         "4"(r1),
8108                         "5"(r2),
8109                         "w"(_k00), // %12
8110                         "w"(_k03), // %13
8111                         "w"(_k06), // %14
8112                         "w"(_k10), // %15
8113                         "w"(_k13), // %16
8114                         "w"(_k16)  // %17
8115                         : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8116                 }
8117 #else
8118                 if (nn > 0)
8119                 {
8120                     asm volatile(
8121                         "pld        [%3, #256]          \n"
8122                         "vld2.f32   {d16-d19}, [%3]!    \n" // q8 q9 = r0
8123 
8124                         "0:                             \n"
8125 
8126                         "pld        [%1, #128]          \n"
8127                         "vld1.f32   {d12-d13}, [%1]     \n" // q6 = _sum0
8128 
8129                         "vmul.f32   q12, q8, %e12[0]    \n"
8130 
8131                         "pld        [%2, #128]          \n"
8132                         "vld1.f32   {d14-d15}, [%2]     \n" // q7 = _sum1
8133 
8134                         "vmul.f32   q13, q8, %e15[0]    \n"
8135 
8136                         "pld        [%3, #128]          \n"
8137                         "vld2.f32   {d20-d21}, [%3]     \n" // q10
8138 
8139                         "vmla.f32   q6, q9, %e12[1]     \n"
8140 
8141                         "vext.32    q11, q8, q10, #1    \n"
8142 
8143                         "vmla.f32   q7, q9, %e15[1]     \n"
8144 
8145                         "pld        [%4, #256]          \n"
8146                         "vld2.f32   {d16-d19}, [%4]!    \n" // r1
8147 
8148                         "vmla.f32   q12, q11, %f12[0]   \n"
8149                         "vmla.f32   q13, q11, %f15[0]   \n"
8150 
8151                         "pld        [%4, #128]          \n"
8152                         "vld2.f32   {d20-d21}, [%4]     \n"
8153 
8154                         "vmla.f32   q6, q8, %e13[0]     \n"
8155                         "vmla.f32   q7, q8, %e16[0]     \n"
8156 
8157                         "vext.32    q11, q8, q10, #1    \n"
8158 
8159                         "vmla.f32   q12, q9, %e13[1]    \n"
8160                         "vmla.f32   q13, q9, %e16[1]    \n"
8161 
8162                         "pld        [%5, #256]          \n"
8163                         "vld2.f32   {d16-d19}, [%5]!    \n" // r2
8164 
8165                         "vmla.f32   q6, q11, %f13[0]    \n"
8166                         "vmla.f32   q7, q11, %f16[0]    \n"
8167 
8168                         "pld        [%5, #128]          \n"
8169                         "vld2.f32   {d20-d21}, [%5]     \n"
8170 
8171                         "vmla.f32   q12, q8, %e14[0]    \n"
8172                         "vmla.f32   q13, q8, %e17[0]    \n"
8173 
8174                         "vext.32    q11, q8, q10, #1    \n"
8175 
8176                         "vmla.f32   q6, q9, %e14[1]     \n"
8177                         "vmla.f32   q7, q9, %e17[1]     \n"
8178 
8179                         "vmla.f32   q12, q11, %f14[0]   \n"
8180                         "vmla.f32   q13, q11, %f17[0]   \n"
8181 
8182                         "pld        [%3, #256]          \n"
8183                         "vld2.f32   {d16-d19}, [%3]!    \n" // q8 q9 = r0
8184 
8185                         "vadd.f32   q6, q6, q12         \n"
8186                         "vadd.f32   q7, q7, q13         \n"
8187 
8188                         "subs       %0, #1              \n"
8189 
8190                         "vst1.f32   {d12-d13}, [%1]!    \n"
8191                         "vst1.f32   {d14-d15}, [%2]!    \n"
8192 
8193                         "bne        0b                  \n"
8194                         "sub        %3, #32             \n"
8195 
8196                         : "=r"(nn),      // %0
8197                         "=r"(outptr0), // %1
8198                         "=r"(outptr1), // %2
8199                         "=r"(r0),      // %3
8200                         "=r"(r1),      // %4
8201                         "=r"(r2)       // %5
8202                         : "0"(nn),
8203                         "1"(outptr0),
8204                         "2"(outptr1),
8205                         "3"(r0),
8206                         "4"(r1),
8207                         "5"(r2),
8208                         "w"(_k00), // %12
8209                         "w"(_k03), // %13
8210                         "w"(_k06), // %14
8211                         "w"(_k10), // %15
8212                         "w"(_k13), // %16
8213                         "w"(_k16)  // %17
8214                         : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
8215                 }
8216 #endif // __aarch64__
8217 #endif // __ARM_NEON
8218                 for (; remain > 0; remain--)
8219                 {
8220 #if __ARM_NEON
8221                     float32x4_t _r00 = vld1q_f32(r0);
8222                     float32x4_t _r10 = vld1q_f32(r1);
8223                     float32x4_t _r20 = vld1q_f32(r2);
8224 
8225                     float32x4_t _sum0 = vmulq_f32(_r00, _k00);
8226                     float32x4_t _sum1 = vmulq_f32(_r00, _k10);
8227                     _sum0 = vmlaq_f32(_sum0, _r10, _k03);
8228                     _sum1 = vmlaq_f32(_sum1, _r10, _k13);
8229                     _sum0 = vmlaq_f32(_sum0, _r20, _k06);
8230                     _sum1 = vmlaq_f32(_sum1, _r20, _k16);
8231 
8232                     _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
8233                     _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
8234 #if __aarch64__
8235                     *outptr0 = vaddvq_f32(_sum0);
8236                     *outptr1 = vaddvq_f32(_sum1);
8237 #else
8238                     float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
8239                     float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
8240                     float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
8241 
8242                     *outptr0 = vget_lane_f32(_ss01, 0);
8243                     *outptr1 = vget_lane_f32(_ss01, 1);
8244 #endif // __aarch64__
8245 #else
8246                     float sum0 = 0.f;
8247                     float sum1 = 0.f;
8248 
8249                     sum0 += r0[0] * k0[0];
8250                     sum0 += r0[1] * k0[1];
8251                     sum0 += r0[2] * k0[2];
8252                     sum0 += r1[0] * k0[3];
8253                     sum0 += r1[1] * k0[4];
8254                     sum0 += r1[2] * k0[5];
8255                     sum0 += r2[0] * k0[6];
8256                     sum0 += r2[1] * k0[7];
8257                     sum0 += r2[2] * k0[8];
8258 
8259                     sum1 += r0[0] * k1[0];
8260                     sum1 += r0[1] * k1[1];
8261                     sum1 += r0[2] * k1[2];
8262                     sum1 += r1[0] * k1[3];
8263                     sum1 += r1[1] * k1[4];
8264                     sum1 += r1[2] * k1[5];
8265                     sum1 += r2[0] * k1[6];
8266                     sum1 += r2[1] * k1[7];
8267                     sum1 += r2[2] * k1[8];
8268 
8269                     *outptr0 += sum0;
8270                     *outptr1 += sum1;
8271 #endif // __ARM_NEON
8272 
8273                     r0 += 2;
8274                     r1 += 2;
8275                     r2 += 2;
8276                     outptr0++;
8277                     outptr1++;
8278                 }
8279 
8280                 r0 += tailstep;
8281                 r1 += tailstep;
8282                 r2 += tailstep;
8283             }
8284 
8285             k0 += 9;
8286             k1 += 9;
8287         }
8288     }
8289 
8290     #pragma omp parallel for num_threads(opt.num_threads)
8291     for (int p = remain_outch_start; p < outch; p++)
8292     {
8293         Mat out = top_blob.channel(p);
8294 
8295         const float bias0 = bias ? bias[p] : 0.f;
8296 
8297         out.fill(bias0);
8298 
8299         const float* kernel0 = kernel + p * inch * 9;
8300 
8301         for (int q = 0; q < inch; q++)
8302         {
8303             float* outptr = out;
8304 
8305             const float* img0 = bottom_blob.channel(q);
8306 
8307             const float* r0 = img0;
8308             const float* r1 = img0 + w;
8309             const float* r2 = img0 + w * 2;
8310 
8311             const float* k0 = kernel0;
8312             const float* k1 = kernel0 + 3;
8313             const float* k2 = kernel0 + 6;
8314 
8315 #if __ARM_NEON
8316             float32x4_t _k0123 = vld1q_f32(k0);
8317             float32x4_t _k3456 = vld1q_f32(k1);
8318             float32x4_t _k6789 = vld1q_f32(k2);
8319 #endif // __ARM_NEON
8320 
8321             int i = 0;
8322 
8323             for (; i < outh; i++)
8324             {
8325 #if __ARM_NEON
8326                 int nn = outw >> 2;
8327                 int remain = outw & 3;
8328 #else
8329                 int remain = outw;
8330 #endif // __ARM_NEON
8331 
8332 #if __ARM_NEON
8333 #if __aarch64__
8334                 if (nn > 0)
8335                 {
8336                     asm volatile(
8337                         "prfm       pldl1keep, [%2, #256]          \n"
8338                         "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
8339                         "0:                                        \n"
8340 
8341                         "prfm       pldl1keep, [%1, #128]          \n"
8342                         "ld1        {v0.4s}, [%1]                  \n"
8343 
8344                         "fmla       v0.4s,  v2.4s, %10.s[0]        \n"
8345                         "fmul       v10.4s, v3.4s, %10.s[1]        \n"
8346 
8347                         "prfm       pldl1keep, [%2, #256]          \n"
8348                         "ld2        {v8.4s, v9.4s}, [%2]           \n"
8349                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
8350 
8351                         "fmul       v11.4s, v1.4s, %10.s[2]        \n"
8352 
8353                         "prfm       pldl1keep, [%3, #256]          \n"
8354                         "ld2        {v2.4s, v3.4s}, [%3], #32      \n"
8355 
8356                         "fmla       v0.4s,  v2.4s, %11.s[0]        \n"
8357                         "fmla       v10.4s, v3.4s, %11.s[1]        \n"
8358 
8359                         "prfm       pldl1keep, [%3, #256]          \n"
8360                         "ld2        {v8.4s, v9.4s}, [%3]           \n"
8361                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
8362 
8363                         "fmla       v11.4s, v1.4s, %11.s[2]        \n"
8364 
8365                         "prfm       pldl1keep, [%4, #256]          \n"
8366                         "ld2        {v2.4s, v3.4s}, [%4], #32      \n"
8367 
8368                         "fmla       v0.4s,  v2.4s, %12.s[0]        \n"
8369                         "fmla       v10.4s, v3.4s, %12.s[1]        \n"
8370 
8371                         "prfm       pldl1keep, [%4, #256]          \n"
8372                         "ld2        {v8.4s, v9.4s}, [%4]           \n"
8373                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
8374 
8375                         "fmla       v11.4s, v1.4s, %12.s[2]        \n"
8376 
8377                         "prfm       pldl1keep, [%2, #256]          \n"
8378                         "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
8379 
8380                         "fadd       v0.4s, v0.4s, v10.4s           \n"
8381                         "fadd       v0.4s, v0.4s, v11.4s           \n"
8382 
8383                         "subs       %w0, %w0, #1                   \n"
8384                         "st1        {v0.4s}, [%1], #16             \n"
8385                         "bne        0b                             \n"
8386                         "sub        %2, %2, #32                    \n"
8387                         : "=r"(nn),     // %0
8388                         "=r"(outptr), // %1
8389                         "=r"(r0),     // %2
8390                         "=r"(r1),     // %3
8391                         "=r"(r2)      // %4
8392                         : "0"(nn),
8393                         "1"(outptr),
8394                         "2"(r0),
8395                         "3"(r1),
8396                         "4"(r2),
8397                         "w"(_k0123), // %10
8398                         "w"(_k3456), // %11
8399                         "w"(_k6789)  // %12
8400                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8401                 }
8402 #else
8403                 if (nn > 0)
8404                 {
8405                     asm volatile(
8406                         "pld        [%2, #256]          \n"
8407                         "vld2.f32   {d4-d7}, [%2]!      \n"
8408 
8409                         "0:                             \n"
8410                         "pld        [%1, #128]          \n"
8411                         "vld1.f32   {d0-d1}, [%1]       \n"
8412 
8413                         "vmla.f32   q0, q2, %e10[0]     \n"
8414                         "vmul.f32   q10, q3, %e10[1]    \n"
8415 
8416                         "pld        [%2, #128]          \n"
8417                         "vld2.f32   {d16-d17}, [%2]     \n"
8418                         "vext.32    q1, q2, q8, #1      \n"
8419 
8420                         "vmul.f32   q11, q1, %f10[0]    \n"
8421 
8422                         "pld        [%3, #256]          \n"
8423                         "vld2.f32   {d4-d7}, [%3]!      \n"
8424 
8425                         "vmla.f32   q0, q2, %e11[0]     \n"
8426                         "vmla.f32   q10, q3, %e11[1]    \n"
8427 
8428                         "pld        [%3, #128]          \n"
8429                         "vld2.f32   {d16-d17}, [%3]     \n"
8430                         "vext.32    q1, q2, q8, #1      \n"
8431 
8432                         "vmla.f32   q11, q1, %f11[0]    \n"
8433 
8434                         "pld        [%4, #256]          \n"
8435                         "vld2.f32   {d4-d7}, [%4]!      \n"
8436 
8437                         "vmla.f32   q0, q2, %e12[0]     \n"
8438                         "vmla.f32   q10, q3, %e12[1]    \n"
8439 
8440                         "pld        [%4, #128]          \n"
8441                         "vld2.f32   {d16-d17}, [%4]     \n"
8442                         "vext.32    q1, q2, q8, #1      \n"
8443 
8444                         "vmla.f32   q11, q1, %f12[0]    \n"
8445 
8446                         "pld        [%2, #256]          \n"
8447                         "vld2.f32   {d4-d7}, [%2]!      \n"
8448 
8449                         "vadd.f32   q0, q0, q10         \n"
8450                         "vadd.f32   q0, q0, q11         \n"
8451 
8452                         "subs       %0, #1              \n"
8453                         "vst1.f32   {d0-d1}, [%1]!      \n"
8454                         "bne        0b                  \n"
8455                         "sub        %2, #32             \n"
8456                         : "=r"(nn),     // %0
8457                         "=r"(outptr), // %1
8458                         "=r"(r0),     // %2
8459                         "=r"(r1),     // %3
8460                         "=r"(r2)      // %4
8461                         : "0"(nn),
8462                         "1"(outptr),
8463                         "2"(r0),
8464                         "3"(r1),
8465                         "4"(r2),
8466                         "w"(_k0123), // %10
8467                         "w"(_k3456), // %11
8468                         "w"(_k6789)  // %12
8469                         : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
8470                 }
8471 #endif // __aarch64__
8472 #endif // __ARM_NEON
8473                 for (; remain > 0; remain--)
8474                 {
8475 #if __ARM_NEON
8476                     float32x4_t _r00 = vld1q_f32(r0);
8477                     float32x4_t _r10 = vld1q_f32(r1);
8478                     float32x4_t _r20 = vld1q_f32(r2);
8479 
8480                     float32x4_t _sum = vmulq_f32(_r00, _k0123);
8481                     _sum = vmlaq_f32(_sum, _r10, _k3456);
8482                     _sum = vmlaq_f32(_sum, _r20, _k6789);
8483 
8484                     _sum = vsetq_lane_f32(*outptr, _sum, 3);
8485 
8486 #if __aarch64__
8487                     *outptr = vaddvq_f32(_sum);
8488 #else
8489                     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
8490                     _ss = vpadd_f32(_ss, _ss);
8491 
8492                     *outptr = vget_lane_f32(_ss, 0);
8493 #endif // __aarch64__
8494 #else
8495                     float sum = 0;
8496 
8497                     sum += r0[0] * k0[0];
8498                     sum += r0[1] * k0[1];
8499                     sum += r0[2] * k0[2];
8500                     sum += r1[0] * k1[0];
8501                     sum += r1[1] * k1[1];
8502                     sum += r1[2] * k1[2];
8503                     sum += r2[0] * k2[0];
8504                     sum += r2[1] * k2[1];
8505                     sum += r2[2] * k2[2];
8506 
8507                     *outptr += sum;
8508 #endif // __ARM_NEON
8509 
8510                     r0 += 2;
8511                     r1 += 2;
8512                     r2 += 2;
8513                     outptr++;
8514                 }
8515 
8516                 r0 += tailstep;
8517                 r1 += tailstep;
8518                 r2 += tailstep;
8519             }
8520 
8521             kernel0 += 9;
8522         }
8523     }
8524 }
8525 
conv3x3s2_transform_kernel_neon(const Mat & _kernel,Mat & kernel_tm,int inch,int outch)8526 static void conv3x3s2_transform_kernel_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
8527 {
8528     kernel_tm.create(8 * 9, inch, outch / 8 + outch % 8);
8529 
8530     const float* kernel = _kernel;
8531 
8532     int p = 0;
8533     for (; p + 7 < outch; p += 8)
8534     {
8535         const float* k0 = kernel + (p + 0) * inch * 9;
8536         const float* k1 = kernel + (p + 1) * inch * 9;
8537         const float* k2 = kernel + (p + 2) * inch * 9;
8538         const float* k3 = kernel + (p + 3) * inch * 9;
8539         const float* k4 = kernel + (p + 4) * inch * 9;
8540         const float* k5 = kernel + (p + 5) * inch * 9;
8541         const float* k6 = kernel + (p + 6) * inch * 9;
8542         const float* k7 = kernel + (p + 7) * inch * 9;
8543 
8544         float* ktmp = kernel_tm.channel(p / 8);
8545 
8546         for (int q = 0; q < inch; q++)
8547         {
8548             for (int k = 0; k < 9; k++)
8549             {
8550                 ktmp[0] = k0[k];
8551                 ktmp[1] = k1[k];
8552                 ktmp[2] = k2[k];
8553                 ktmp[3] = k3[k];
8554                 ktmp[4] = k4[k];
8555                 ktmp[5] = k5[k];
8556                 ktmp[6] = k6[k];
8557                 ktmp[7] = k7[k];
8558                 ktmp += 8;
8559             }
8560 
8561             k0 += 9;
8562             k1 += 9;
8563             k2 += 9;
8564             k3 += 9;
8565             k4 += 9;
8566             k5 += 9;
8567             k6 += 9;
8568             k7 += 9;
8569         }
8570     }
8571     for (; p < outch; p++)
8572     {
8573         const float* k0 = kernel + (p + 0) * inch * 9;
8574 
8575         float* ktmp = kernel_tm.channel(p / 8 + p % 8);
8576 
8577         for (int q = 0; q < inch; q++)
8578         {
8579             for (int k = 0; k < 9; k++)
8580             {
8581                 ktmp[k] = k0[k];
8582             }
8583             ktmp += 9;
8584 
8585             k0 += 9;
8586         }
8587     }
8588 }
8589 
conv3x3s2_packed_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & _kernel,const Mat & _bias,const Option & opt)8590 static void conv3x3s2_packed_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
8591 {
8592     int w = bottom_blob.w;
8593     int inch = bottom_blob.c;
8594 
8595     int outw = top_blob.w;
8596     int outh = top_blob.h;
8597     int outch = top_blob.c;
8598 
8599     const int tailstep = w - 2 * outw + w;
8600 
8601     //     const float* kernel = _kernel;
8602     const float* bias = _bias;
8603 
8604     int nn_outch = outch >> 3;
8605     int remain_outch_start = nn_outch << 3;
8606 
8607     #pragma omp parallel for num_threads(opt.num_threads)
8608     for (int pp = 0; pp < nn_outch; pp++)
8609     {
8610         int p = pp * 8;
8611 
8612         Mat out0 = top_blob.channel(p + 0);
8613         Mat out1 = top_blob.channel(p + 1);
8614         Mat out2 = top_blob.channel(p + 2);
8615         Mat out3 = top_blob.channel(p + 3);
8616         Mat out4 = top_blob.channel(p + 4);
8617         Mat out5 = top_blob.channel(p + 5);
8618         Mat out6 = top_blob.channel(p + 6);
8619         Mat out7 = top_blob.channel(p + 7);
8620 
8621         const float bias0 = bias ? bias[p + 0] : 0.f;
8622         const float bias1 = bias ? bias[p + 1] : 0.f;
8623         const float bias2 = bias ? bias[p + 2] : 0.f;
8624         const float bias3 = bias ? bias[p + 3] : 0.f;
8625         const float bias4 = bias ? bias[p + 4] : 0.f;
8626         const float bias5 = bias ? bias[p + 5] : 0.f;
8627         const float bias6 = bias ? bias[p + 6] : 0.f;
8628         const float bias7 = bias ? bias[p + 7] : 0.f;
8629 
8630         out0.fill(bias0);
8631         out1.fill(bias1);
8632         out2.fill(bias2);
8633         out3.fill(bias3);
8634         out4.fill(bias4);
8635         out5.fill(bias5);
8636         out6.fill(bias6);
8637         out7.fill(bias7);
8638 
8639         const float* ktmp = _kernel.channel(p / 8);
8640 
8641         for (int q = 0; q < inch; q++)
8642         {
8643             float* outptr0 = out0;
8644             float* outptr1 = out1;
8645             float* outptr2 = out2;
8646             float* outptr3 = out3;
8647             float* outptr4 = out4;
8648             float* outptr5 = out5;
8649             float* outptr6 = out6;
8650             float* outptr7 = out7;
8651 
8652             const float* img0 = bottom_blob.channel(q);
8653 
8654             const float* r0 = img0;
8655             const float* r1 = img0 + w;
8656             const float* r2 = img0 + w * 2;
8657 
8658             int i = 0;
8659 
8660             for (; i < outh; i++)
8661             {
8662 #if __ARM_NEON
8663                 int nn = outw >> 2;
8664                 int remain = outw & 3;
8665 #else
8666                 int remain = outw;
8667 #endif // __ARM_NEON
8668 
8669 #if __ARM_NEON
8670 #if __aarch64__
8671                 if (nn > 0)
8672                 {
8673                     asm volatile(
8674                         "0:                                 \n"
8675 
8676                         "prfm   pldl1keep, [%1, #128]       \n"
8677                         "ld1    {v8.4s}, [%1]               \n"
8678                         "prfm   pldl1keep, [%2, #128]       \n"
8679                         "ld1    {v9.4s}, [%2]               \n"
8680 
8681                         "prfm   pldl1keep, [%3, #128]       \n"
8682                         "ld1    {v10.4s}, [%3]              \n"
8683                         "prfm   pldl1keep, [%4, #128]       \n"
8684                         "ld1    {v11.4s}, [%4]              \n"
8685 
8686                         ///
8687                         "prfm   pldl1keep, [%9, #256]       \n"
8688                         "ld2    {v4.4s, v5.4s}, [%9], #32   \n" // v4=00 v5=01
8689 
8690                         "ld1    {v0.4s, v1.4s}, [%12], #32  \n"
8691 
8692                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
8693                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
8694 
8695                         "prfm   pldl1keep, [%5, #128]       \n"
8696                         "ld1    {v12.4s}, [%5]              \n"
8697                         "prfm   pldl1keep, [%6, #128]       \n"
8698                         "ld1    {v13.4s}, [%6]              \n"
8699 
8700                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
8701                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
8702 
8703                         "prfm   pldl1keep, [%7, #128]       \n"
8704                         "ld1    {v14.4s}, [%7]              \n"
8705                         "prfm   pldl1keep, [%8, #128]       \n"
8706                         "ld1    {v15.4s}, [%8]              \n"
8707 
8708                         "ld1    {v2.4s, v3.4s}, [%12], #32  \n"
8709 
8710                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
8711                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
8712                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
8713                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
8714 
8715                         "prfm   pldl1keep, [%9, #256]       \n"
8716                         "ld2    {v6.4s, v7.4s}, [%9]        \n" // v6
8717 
8718                         "fmla   v8.4s, v5.4s, v2.s[0]       \n"
8719                         "fmla   v9.4s, v5.4s, v2.s[1]       \n"
8720                         "fmla   v10.4s, v5.4s, v2.s[2]      \n"
8721                         "fmla   v11.4s, v5.4s, v2.s[3]      \n"
8722 
8723                         "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=02
8724 
8725                         "ld1    {v0.4s, v1.4s}, [%12], #32  \n"
8726 
8727                         "fmla   v12.4s, v5.4s, v3.s[0]      \n"
8728                         "fmla   v13.4s, v5.4s, v3.s[1]      \n"
8729                         "fmla   v14.4s, v5.4s, v3.s[2]      \n"
8730                         "fmla   v15.4s, v5.4s, v3.s[3]      \n"
8731 
8732                         ///
8733                         "prfm   pldl1keep, [%10, #256]      \n"
8734                         "ld2    {v4.4s, v5.4s}, [%10], #32  \n" // v4=10 v5=11
8735 
8736                         "fmla   v8.4s, v6.4s, v0.s[0]       \n"
8737                         "fmla   v9.4s, v6.4s, v0.s[1]       \n"
8738                         "fmla   v10.4s, v6.4s, v0.s[2]      \n"
8739                         "fmla   v11.4s, v6.4s, v0.s[3]      \n"
8740 
8741                         "ld1    {v2.4s, v3.4s}, [%12], #32  \n"
8742 
8743                         "fmla   v12.4s, v6.4s, v1.s[0]      \n"
8744                         "fmla   v13.4s, v6.4s, v1.s[1]      \n"
8745                         "fmla   v14.4s, v6.4s, v1.s[2]      \n"
8746                         "fmla   v15.4s, v6.4s, v1.s[3]      \n"
8747 
8748                         "fmla   v8.4s, v4.4s, v2.s[0]       \n"
8749                         "fmla   v9.4s, v4.4s, v2.s[1]       \n"
8750                         "fmla   v10.4s, v4.4s, v2.s[2]      \n"
8751                         "fmla   v11.4s, v4.4s, v2.s[3]      \n"
8752 
8753                         "ld1    {v0.4s, v1.4s}, [%12], #32  \n"
8754 
8755                         "fmla   v12.4s, v4.4s, v3.s[0]      \n"
8756                         "fmla   v13.4s, v4.4s, v3.s[1]      \n"
8757                         "fmla   v14.4s, v4.4s, v3.s[2]      \n"
8758                         "fmla   v15.4s, v4.4s, v3.s[3]      \n"
8759 
8760                         "prfm   pldl1keep, [%10, #256]      \n"
8761                         "ld2    {v6.4s, v7.4s}, [%10]       \n" // v6
8762 
8763                         "fmla   v8.4s, v5.4s, v0.s[0]       \n"
8764                         "fmla   v9.4s, v5.4s, v0.s[1]       \n"
8765                         "fmla   v10.4s, v5.4s, v0.s[2]      \n"
8766                         "fmla   v11.4s, v5.4s, v0.s[3]      \n"
8767 
8768                         "ld1    {v2.4s, v3.4s}, [%12], #32  \n"
8769 
8770                         "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=12
8771 
8772                         "fmla   v12.4s, v5.4s, v1.s[0]      \n"
8773                         "fmla   v13.4s, v5.4s, v1.s[1]      \n"
8774                         "fmla   v14.4s, v5.4s, v1.s[2]      \n"
8775                         "fmla   v15.4s, v5.4s, v1.s[3]      \n"
8776 
8777                         ///
8778                         "prfm   pldl1keep, [%11, #256]      \n"
8779                         "ld2    {v4.4s, v5.4s}, [%11], #32  \n" // v4=20 v5=21
8780 
8781                         "fmla   v8.4s, v6.4s, v2.s[0]       \n"
8782                         "fmla   v9.4s, v6.4s, v2.s[1]       \n"
8783                         "fmla   v10.4s, v6.4s, v2.s[2]      \n"
8784                         "fmla   v11.4s, v6.4s, v2.s[3]      \n"
8785 
8786                         "ld1    {v0.4s, v1.4s}, [%12], #32  \n"
8787 
8788                         "fmla   v12.4s, v6.4s, v3.s[0]      \n"
8789                         "fmla   v13.4s, v6.4s, v3.s[1]      \n"
8790                         "fmla   v14.4s, v6.4s, v3.s[2]      \n"
8791                         "fmla   v15.4s, v6.4s, v3.s[3]      \n"
8792 
8793                         "fmla   v8.4s, v4.4s, v0.s[0]       \n"
8794                         "fmla   v9.4s, v4.4s, v0.s[1]       \n"
8795                         "fmla   v10.4s, v4.4s, v0.s[2]      \n"
8796                         "fmla   v11.4s, v4.4s, v0.s[3]      \n"
8797 
8798                         "ld1    {v2.4s, v3.4s}, [%12], #32  \n"
8799 
8800                         "fmla   v12.4s, v4.4s, v1.s[0]      \n"
8801                         "fmla   v13.4s, v4.4s, v1.s[1]      \n"
8802                         "fmla   v14.4s, v4.4s, v1.s[2]      \n"
8803                         "fmla   v15.4s, v4.4s, v1.s[3]      \n"
8804 
8805                         "prfm   pldl1keep, [%11, #256]      \n"
8806                         "ld2    {v6.4s, v7.4s}, [%11]       \n" // v6
8807 
8808                         "fmla   v8.4s, v5.4s, v2.s[0]       \n"
8809                         "fmla   v9.4s, v5.4s, v2.s[1]       \n"
8810                         "fmla   v10.4s, v5.4s, v2.s[2]      \n"
8811                         "fmla   v11.4s, v5.4s, v2.s[3]      \n"
8812 
8813                         "ext    v6.16b, v4.16b, v6.16b, #4  \n" // v6=22
8814 
8815                         "ld1    {v0.4s, v1.4s}, [%12], #32  \n"
8816 
8817                         "fmla   v12.4s, v5.4s, v3.s[0]      \n"
8818                         "fmla   v13.4s, v5.4s, v3.s[1]      \n"
8819                         "fmla   v14.4s, v5.4s, v3.s[2]      \n"
8820                         "fmla   v15.4s, v5.4s, v3.s[3]      \n"
8821 
8822                         "fmla   v8.4s, v6.4s, v0.s[0]       \n"
8823                         "fmla   v9.4s, v6.4s, v0.s[1]       \n"
8824                         "fmla   v10.4s, v6.4s, v0.s[2]      \n"
8825                         "fmla   v11.4s, v6.4s, v0.s[3]      \n"
8826 
8827                         "fmla   v12.4s, v6.4s, v1.s[0]      \n"
8828                         "fmla   v13.4s, v6.4s, v1.s[1]      \n"
8829 
8830                         "st1    {v8.4s}, [%1], #16          \n"
8831                         "st1    {v9.4s}, [%2], #16          \n"
8832 
8833                         "fmla   v14.4s, v6.4s, v1.s[2]      \n"
8834                         "fmla   v15.4s, v6.4s, v1.s[3]      \n"
8835 
8836                         "st1    {v10.4s}, [%3], #16         \n"
8837                         "st1    {v11.4s}, [%4], #16         \n"
8838 
8839                         "sub    %12, %12, #288              \n"
8840 
8841                         "st1    {v12.4s}, [%5], #16         \n"
8842                         "st1    {v13.4s}, [%6], #16         \n"
8843 
8844                         "subs   %w0, %w0, #1                \n"
8845 
8846                         "st1    {v14.4s}, [%7], #16         \n"
8847                         "st1    {v15.4s}, [%8], #16         \n"
8848 
8849                         "bne    0b                          \n"
8850                         : "=r"(nn),      // %0
8851                         "=r"(outptr0), // %1
8852                         "=r"(outptr1), // %2
8853                         "=r"(outptr2), // %3
8854                         "=r"(outptr3), // %4
8855                         "=r"(outptr4), // %5
8856                         "=r"(outptr5), // %6
8857                         "=r"(outptr6), // %7
8858                         "=r"(outptr7), // %8
8859                         "=r"(r0),      // %9
8860                         "=r"(r1),      // %10
8861                         "=r"(r2),      // %11
8862                         "=r"(ktmp)     // %12
8863                         : "0"(nn),
8864                         "1"(outptr0),
8865                         "2"(outptr1),
8866                         "3"(outptr2),
8867                         "4"(outptr3),
8868                         "5"(outptr4),
8869                         "6"(outptr5),
8870                         "7"(outptr6),
8871                         "8"(outptr7),
8872                         "9"(r0),
8873                         "10"(r1),
8874                         "11"(r2),
8875                         "12"(ktmp)
8876                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8877                 }
8878 #else  // __aarch64__
8879                 if (nn > 0)
8880                 {
8881                     asm volatile(
8882                         "0:                             \n"
8883 
8884                         "pld        [%1, #128]          \n"
8885                         "vld1.f32   {d16-d17}, [%1]     \n"
8886                         "pld        [%2, #128]          \n"
8887                         "vld1.f32   {d18-d19}, [%2]     \n"
8888 
8889                         "pld        [%3, #128]          \n"
8890                         "vld1.f32   {d20-d21}, [%3]     \n"
8891                         "pld        [%4, #128]          \n"
8892                         "vld1.f32   {d22-d23}, [%4]     \n"
8893 
8894                         ///
8895                         "pld        [%9, #256]          \n"
8896                         "vld2.f32   {d8-d11}, [%9]!     \n" // q4=00 q5=01
8897 
8898                         "vld1.f32   {d0-d3}, [%12 :128]! \n"
8899 
8900                         "vmla.f32   q8, q4, d0[0]       \n"
8901                         "vmla.f32   q9, q4, d0[1]       \n"
8902 
8903                         "pld        [%5, #128]          \n"
8904                         "vld1.f32   {d24-d25}, [%5]     \n"
8905                         "pld        [%6, #128]          \n"
8906                         "vld1.f32   {d26-d27}, [%6]     \n"
8907 
8908                         "vmla.f32   q10, q4, d1[0]      \n"
8909                         "vmla.f32   q11, q4, d1[1]      \n"
8910 
8911                         "pld        [%7, #128]          \n"
8912                         "vld1.f32   {d28-d29}, [%7]     \n"
8913                         "pld        [%8, #128]          \n"
8914                         "vld1.f32   {d30-d31}, [%8]     \n"
8915 
8916                         "vld1.f32   {d4-d7}, [%12 :128]! \n"
8917 
8918                         "vmla.f32   q12, q4, d2[0]      \n"
8919                         "vmla.f32   q13, q4, d2[1]      \n"
8920                         "vmla.f32   q14, q4, d3[0]      \n"
8921                         "vmla.f32   q15, q4, d3[1]      \n"
8922 
8923                         "pld        [%9, #128]          \n"
8924                         "vld2.f32   {d12-d13}, [%9]     \n" // q6
8925 
8926                         "vmla.f32   q8, q5, d4[0]       \n"
8927                         "vmla.f32   q9, q5, d4[1]       \n"
8928                         "vmla.f32   q10, q5, d5[0]      \n"
8929                         "vmla.f32   q11, q5, d5[1]      \n"
8930 
8931                         "vext.f32   q6, q4, q6, #1      \n" // q6=02
8932 
8933                         "vld1.f32   {d0-d3}, [%12 :128]! \n"
8934 
8935                         "vmla.f32   q12, q5, d6[0]      \n"
8936                         "vmla.f32   q13, q5, d6[1]      \n"
8937                         "vmla.f32   q14, q5, d7[0]      \n"
8938                         "vmla.f32   q15, q5, d7[1]      \n"
8939 
8940                         ///
8941                         "pld        [%10, #256]         \n"
8942                         "vld2.f32   {d8-d11}, [%10]!    \n" // q4=10 q5=11
8943 
8944                         "vmla.f32   q8, q6, d0[0]       \n"
8945                         "vmla.f32   q9, q6, d0[1]       \n"
8946                         "vmla.f32   q10, q6, d1[0]      \n"
8947                         "vmla.f32   q11, q6, d1[1]      \n"
8948 
8949                         "vld1.f32   {d4-d7}, [%12 :128]! \n"
8950 
8951                         "vmla.f32   q12, q6, d2[0]      \n"
8952                         "vmla.f32   q13, q6, d2[1]      \n"
8953                         "vmla.f32   q14, q6, d3[0]      \n"
8954                         "vmla.f32   q15, q6, d3[1]      \n"
8955 
8956                         "vmla.f32   q8, q4, d4[0]       \n"
8957                         "vmla.f32   q9, q4, d4[1]       \n"
8958                         "vmla.f32   q10, q4, d5[0]      \n"
8959                         "vmla.f32   q11, q4, d5[1]      \n"
8960 
8961                         "vld1.f32   {d0-d3}, [%12 :128]! \n"
8962 
8963                         "vmla.f32   q12, q4, d6[0]      \n"
8964                         "vmla.f32   q13, q4, d6[1]      \n"
8965                         "vmla.f32   q14, q4, d7[0]      \n"
8966                         "vmla.f32   q15, q4, d7[1]      \n"
8967 
8968                         "pld        [%10, #128]         \n"
8969                         "vld2.f32   {d12-d13}, [%10]    \n" // q6
8970 
8971                         "vmla.f32   q8, q5, d0[0]       \n"
8972                         "vmla.f32   q9, q5, d0[1]       \n"
8973                         "vmla.f32   q10, q5, d1[0]      \n"
8974                         "vmla.f32   q11, q5, d1[1]      \n"
8975 
8976                         "vld1.f32   {d4-d7}, [%12 :128]! \n"
8977 
8978                         "vext.f32   q6, q4, q6, #1      \n" // q6=12
8979 
8980                         "vmla.f32   q12, q5, d2[0]      \n"
8981                         "vmla.f32   q13, q5, d2[1]      \n"
8982                         "vmla.f32   q14, q5, d3[0]      \n"
8983                         "vmla.f32   q15, q5, d3[1]      \n"
8984 
8985                         ///
8986                         "pld        [%11, #256]         \n"
8987                         "vld2.f32   {d8-d11}, [%11]!    \n" // q4=20 q5=21
8988 
8989                         "vmla.f32   q8, q6, d4[0]       \n"
8990                         "vmla.f32   q9, q6, d4[1]       \n"
8991                         "vmla.f32   q10, q6, d5[0]      \n"
8992                         "vmla.f32   q11, q6, d5[1]      \n"
8993 
8994                         "vld1.f32   {d0-d3}, [%12 :128]! \n"
8995 
8996                         "vmla.f32   q12, q6, d6[0]      \n"
8997                         "vmla.f32   q13, q6, d6[1]      \n"
8998                         "vmla.f32   q14, q6, d7[0]      \n"
8999                         "vmla.f32   q15, q6, d7[1]      \n"
9000 
9001                         "vmla.f32   q8, q4, d0[0]       \n"
9002                         "vmla.f32   q9, q4, d0[1]       \n"
9003                         "vmla.f32   q10, q4, d1[0]      \n"
9004                         "vmla.f32   q11, q4, d1[1]      \n"
9005 
9006                         "vld1.f32   {d4-d7}, [%12 :128]! \n"
9007 
9008                         "vmla.f32   q12, q4, d2[0]      \n"
9009                         "vmla.f32   q13, q4, d2[1]      \n"
9010                         "vmla.f32   q14, q4, d3[0]      \n"
9011                         "vmla.f32   q15, q4, d3[1]      \n"
9012 
9013                         "pld        [%11, #128]         \n"
9014                         "vld2.f32   {d12-d13}, [%11]    \n" // q6
9015 
9016                         "vmla.f32   q8, q5, d4[0]       \n"
9017                         "vmla.f32   q9, q5, d4[1]       \n"
9018                         "vmla.f32   q10, q5, d5[0]      \n"
9019                         "vmla.f32   q11, q5, d5[1]      \n"
9020 
9021                         "vext.f32   q6, q4, q6, #1      \n" // q6=22
9022 
9023                         "vld1.f32   {d0-d3}, [%12 :128]! \n"
9024 
9025                         "vmla.f32   q12, q5, d6[0]      \n"
9026                         "vmla.f32   q13, q5, d6[1]      \n"
9027                         "vmla.f32   q14, q5, d7[0]      \n"
9028                         "vmla.f32   q15, q5, d7[1]      \n"
9029 
9030                         "vmla.f32   q8, q6, d0[0]       \n"
9031                         "vmla.f32   q9, q6, d0[1]       \n"
9032                         "vmla.f32   q10, q6, d1[0]      \n"
9033                         "vmla.f32   q11, q6, d1[1]      \n"
9034 
9035                         "vmla.f32   q12, q6, d2[0]      \n"
9036                         "vmla.f32   q13, q6, d2[1]      \n"
9037 
9038                         "vst1.f32   {d16-d17}, [%1]!    \n"
9039                         "vst1.f32   {d18-d19}, [%2]!    \n"
9040 
9041                         "vmla.f32   q14, q6, d3[0]      \n"
9042                         "vmla.f32   q15, q6, d3[1]      \n"
9043 
9044                         "vst1.f32   {d20-d21}, [%3]!    \n"
9045                         "vst1.f32   {d22-d23}, [%4]!    \n"
9046 
9047                         "sub        %12, %12, #288      \n"
9048 
9049                         "vst1.f32   {d24-d25}, [%5]!    \n"
9050                         "vst1.f32   {d26-d27}, [%6]!    \n"
9051 
9052                         "subs       %0, #1              \n"
9053 
9054                         "vst1.f32   {d28-d29}, [%7]!    \n"
9055                         "vst1.f32   {d30-d31}, [%8]!    \n"
9056 
9057                         "bne        0b                  \n"
9058                         : "=r"(nn),      // %0
9059                         "=r"(outptr0), // %1
9060                         "=r"(outptr1), // %2
9061                         "=r"(outptr2), // %3
9062                         "=r"(outptr3), // %4
9063                         "=r"(outptr4), // %5
9064                         "=r"(outptr5), // %6
9065                         "=r"(outptr6), // %7
9066                         "=r"(outptr7), // %8
9067                         "=r"(r0),      // %9
9068                         "=r"(r1),      // %10
9069                         "=r"(r2),      // %11
9070                         "=r"(ktmp)     // %12
9071                         : "0"(nn),
9072                         "1"(outptr0),
9073                         "2"(outptr1),
9074                         "3"(outptr2),
9075                         "4"(outptr3),
9076                         "5"(outptr4),
9077                         "6"(outptr5),
9078                         "7"(outptr6),
9079                         "8"(outptr7),
9080                         "9"(r0),
9081                         "10"(r1),
9082                         "11"(r2),
9083                         "12"(ktmp)
9084                         : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9085                 }
9086 #endif // __aarch64__
9087 #endif // __ARM_NEON
9088                 for (; remain > 0; remain--)
9089                 {
9090 #if __ARM_NEON
9091 #if __aarch64__
9092                     asm volatile(
9093                         "ld1    {v10.4s, v11.4s}, [%11], #32    \n"
9094 
9095                         "prfm   pldl1keep, [%8, #128]   \n"
9096                         "ld1    {v0.4s}, [%8]           \n"
9097 
9098                         "ld1    {v12.4s, v13.4s}, [%11], #32    \n"
9099 
9100                         "ld1    {v8.s}[0], [%0]         \n"
9101                         "ld1    {v8.s}[1], [%1]         \n"
9102                         "ld1    {v8.s}[2], [%2]         \n"
9103                         "ld1    {v8.s}[3], [%3]         \n"
9104 
9105                         "fmul   v14.4s, v10.4s, v0.s[0] \n"
9106                         "fmul   v15.4s, v11.4s, v0.s[0] \n"
9107 
9108                         "ld1    {v9.s}[0], [%4]         \n"
9109                         "ld1    {v9.s}[1], [%5]         \n"
9110                         "ld1    {v9.s}[2], [%6]         \n"
9111                         "ld1    {v9.s}[3], [%7]         \n"
9112 
9113                         "ld1    {v10.4s, v11.4s}, [%11], #32    \n"
9114 
9115                         "fmla   v8.4s, v12.4s, v0.s[1]  \n"
9116                         "fmla   v9.4s, v13.4s, v0.s[1]  \n"
9117 
9118                         "ld1    {v12.4s, v13.4s}, [%11], #32    \n"
9119 
9120                         "fmla   v14.4s, v10.4s, v0.s[2] \n"
9121                         "fmla   v15.4s, v11.4s, v0.s[2] \n"
9122 
9123                         "prfm   pldl1keep, [%9, #128]   \n"
9124                         "ld1    {v1.4s}, [%9]           \n"
9125 
9126                         "ld1    {v10.4s, v11.4s}, [%11], #32    \n"
9127 
9128                         "fmla   v8.4s, v12.4s, v1.s[0]  \n"
9129                         "fmla   v9.4s, v13.4s, v1.s[0]  \n"
9130 
9131                         "ld1    {v12.4s, v13.4s}, [%11], #32    \n"
9132 
9133                         "fmla   v14.4s, v10.4s, v1.s[1] \n"
9134                         "fmla   v15.4s, v11.4s, v1.s[1] \n"
9135 
9136                         "ld1    {v10.4s, v11.4s}, [%11], #32    \n"
9137 
9138                         "fmla   v8.4s, v12.4s, v1.s[2]  \n"
9139                         "fmla   v9.4s, v13.4s, v1.s[2]  \n"
9140 
9141                         "prfm   pldl1keep, [%10, #128]  \n"
9142                         "ld1    {v0.4s}, [%10]          \n"
9143 
9144                         "ld1    {v12.4s, v13.4s}, [%11], #32    \n"
9145 
9146                         "fmla   v14.4s, v10.4s, v0.s[0] \n"
9147                         "fmla   v15.4s, v11.4s, v0.s[0] \n"
9148 
9149                         "ld1    {v10.4s, v11.4s}, [%11], #32    \n"
9150 
9151                         "fmla   v8.4s, v12.4s, v0.s[1]  \n"
9152                         "fmla   v9.4s, v13.4s, v0.s[1]  \n"
9153 
9154                         "fmla   v14.4s, v10.4s, v0.s[2] \n"
9155                         "fmla   v15.4s, v11.4s, v0.s[2] \n"
9156 
9157                         "fadd   v8.4s, v8.4s, v14.4s    \n"
9158                         "fadd   v9.4s, v9.4s, v15.4s    \n"
9159 
9160                         "sub    %11, %11, #288          \n"
9161 
9162                         "st1    {v8.s}[0], [%0], #4     \n"
9163                         "st1    {v8.s}[1], [%1], #4     \n"
9164                         "st1    {v8.s}[2], [%2], #4     \n"
9165                         "st1    {v8.s}[3], [%3], #4     \n"
9166 
9167                         "st1    {v9.s}[0], [%4], #4     \n"
9168                         "st1    {v9.s}[1], [%5], #4     \n"
9169                         "st1    {v9.s}[2], [%6], #4     \n"
9170                         "st1    {v9.s}[3], [%7], #4     \n"
9171 
9172                         : "=r"(outptr0), // %0
9173                         "=r"(outptr1), // %1
9174                         "=r"(outptr2), // %2
9175                         "=r"(outptr3), // %3
9176                         "=r"(outptr4), // %4
9177                         "=r"(outptr5), // %5
9178                         "=r"(outptr6), // %6
9179                         "=r"(outptr7), // %7
9180                         "=r"(r0),      // %8
9181                         "=r"(r1),      // %9
9182                         "=r"(r2),      // %10
9183                         "=r"(ktmp)     // %11
9184                         : "0"(outptr0),
9185                         "1"(outptr1),
9186                         "2"(outptr2),
9187                         "3"(outptr3),
9188                         "4"(outptr4),
9189                         "5"(outptr5),
9190                         "6"(outptr6),
9191                         "7"(outptr7),
9192                         "8"(r0),
9193                         "9"(r1),
9194                         "10"(r2),
9195                         "11"(ktmp)
9196                         : "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
9197 #else  // __aarch64__
9198                     asm volatile(
9199                         "vld1.f32   {d20-d23}, [%11 :128]! \n"
9200 
9201                         "pld        [%8, #128]      \n"
9202                         "vld1.f32   {d0-d1}, [%8]   \n"
9203 
9204                         "vld1.f32   {d24-d27}, [%11 :128]! \n"
9205 
9206                         "vld1.f32   {d16[0]}, [%0]  \n"
9207                         "vld1.f32   {d16[1]}, [%1]  \n"
9208                         "vld1.f32   {d17[0]}, [%2]  \n"
9209                         "vld1.f32   {d17[1]}, [%3]  \n"
9210 
9211                         "vmul.f32   q14, q10, d0[0] \n"
9212                         "vmul.f32   q15, q11, d0[0] \n"
9213 
9214                         "vld1.f32   {d18[0]}, [%4]  \n"
9215                         "vld1.f32   {d18[1]}, [%5]  \n"
9216                         "vld1.f32   {d19[0]}, [%6]  \n"
9217                         "vld1.f32   {d19[1]}, [%7]  \n"
9218 
9219                         "vld1.f32   {d20-d23}, [%11 :128]! \n"
9220 
9221                         "vmla.f32   q8, q12, d0[1]  \n"
9222                         "vmla.f32   q9, q13, d0[1]  \n"
9223 
9224                         "vld1.f32   {d24-d27}, [%11 :128]! \n"
9225 
9226                         "vmla.f32   q14, q10, d1[0] \n"
9227                         "vmla.f32   q15, q11, d1[0] \n"
9228 
9229                         "pld        [%9, #128]      \n"
9230                         "vld1.f32   {d2-d3}, [%9]   \n"
9231 
9232                         "vld1.f32   {d20-d23}, [%11 :128]! \n"
9233 
9234                         "vmla.f32   q8, q12, d2[0]  \n"
9235                         "vmla.f32   q9, q13, d2[0]  \n"
9236 
9237                         "vld1.f32   {d24-d27}, [%11 :128]! \n"
9238 
9239                         "vmla.f32   q14, q10, d2[1] \n"
9240                         "vmla.f32   q15, q11, d2[1] \n"
9241 
9242                         "vld1.f32   {d20-d23}, [%11 :128]! \n"
9243 
9244                         "vmla.f32   q8, q12, d3[0]  \n"
9245                         "vmla.f32   q9, q13, d3[0]  \n"
9246 
9247                         "pld        [%10, #128]     \n"
9248                         "vld1.f32   {d0-d1}, [%10]  \n"
9249 
9250                         "vld1.f32   {d24-d27}, [%11 :128]! \n"
9251 
9252                         "vmla.f32   q14, q10, d0[0] \n"
9253                         "vmla.f32   q15, q11, d0[0] \n"
9254 
9255                         "vld1.f32   {d20-d23}, [%11 :128]! \n"
9256 
9257                         "vmla.f32   q8, q12, d0[1]  \n"
9258                         "vmla.f32   q9, q13, d0[1]  \n"
9259 
9260                         "vmla.f32   q14, q10, d1[0] \n"
9261                         "vmla.f32   q15, q11, d1[0] \n"
9262 
9263                         "vadd.f32   q8, q8, q14     \n"
9264                         "vadd.f32   q9, q9, q15     \n"
9265 
9266                         "sub        %11, %11, #288  \n"
9267 
9268                         "vst1.f32   {d16[0]}, [%0]! \n"
9269                         "vst1.f32   {d16[1]}, [%1]! \n"
9270                         "vst1.f32   {d17[0]}, [%2]! \n"
9271                         "vst1.f32   {d17[1]}, [%3]! \n"
9272 
9273                         "vst1.f32   {d18[0]}, [%4]! \n"
9274                         "vst1.f32   {d18[1]}, [%5]! \n"
9275                         "vst1.f32   {d19[0]}, [%6]! \n"
9276                         "vst1.f32   {d19[1]}, [%7]! \n"
9277 
9278                         : "=r"(outptr0), // %0
9279                         "=r"(outptr1), // %1
9280                         "=r"(outptr2), // %2
9281                         "=r"(outptr3), // %3
9282                         "=r"(outptr4), // %4
9283                         "=r"(outptr5), // %5
9284                         "=r"(outptr6), // %6
9285                         "=r"(outptr7), // %7
9286                         "=r"(r0),      // %8
9287                         "=r"(r1),      // %9
9288                         "=r"(r2),      // %10
9289                         "=r"(ktmp)     // %11
9290                         : "0"(outptr0),
9291                         "1"(outptr1),
9292                         "2"(outptr2),
9293                         "3"(outptr3),
9294                         "4"(outptr4),
9295                         "5"(outptr5),
9296                         "6"(outptr6),
9297                         "7"(outptr7),
9298                         "8"(r0),
9299                         "9"(r1),
9300                         "10"(r2),
9301                         "11"(ktmp)
9302                         : "memory", "q0", "q1", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9303 #endif // __aarch64__
9304 #else  // __ARM_NEON
9305                     float sum0 = 0.f;
9306                     float sum1 = 0.f;
9307                     float sum2 = 0.f;
9308                     float sum3 = 0.f;
9309                     float sum4 = 0.f;
9310                     float sum5 = 0.f;
9311                     float sum6 = 0.f;
9312                     float sum7 = 0.f;
9313 
9314                     sum0 += r0[0] * ktmp[0];
9315                     sum1 += r0[0] * ktmp[1];
9316                     sum2 += r0[0] * ktmp[2];
9317                     sum3 += r0[0] * ktmp[3];
9318                     sum4 += r0[0] * ktmp[4];
9319                     sum5 += r0[0] * ktmp[5];
9320                     sum6 += r0[0] * ktmp[6];
9321                     sum7 += r0[0] * ktmp[7];
9322                     ktmp += 8;
9323 
9324                     sum0 += r0[1] * ktmp[0];
9325                     sum1 += r0[1] * ktmp[1];
9326                     sum2 += r0[1] * ktmp[2];
9327                     sum3 += r0[1] * ktmp[3];
9328                     sum4 += r0[1] * ktmp[4];
9329                     sum5 += r0[1] * ktmp[5];
9330                     sum6 += r0[1] * ktmp[6];
9331                     sum7 += r0[1] * ktmp[7];
9332                     ktmp += 8;
9333 
9334                     sum0 += r0[2] * ktmp[0];
9335                     sum1 += r0[2] * ktmp[1];
9336                     sum2 += r0[2] * ktmp[2];
9337                     sum3 += r0[2] * ktmp[3];
9338                     sum4 += r0[2] * ktmp[4];
9339                     sum5 += r0[2] * ktmp[5];
9340                     sum6 += r0[2] * ktmp[6];
9341                     sum7 += r0[2] * ktmp[7];
9342                     ktmp += 8;
9343 
9344                     sum0 += r1[0] * ktmp[0];
9345                     sum1 += r1[0] * ktmp[1];
9346                     sum2 += r1[0] * ktmp[2];
9347                     sum3 += r1[0] * ktmp[3];
9348                     sum4 += r1[0] * ktmp[4];
9349                     sum5 += r1[0] * ktmp[5];
9350                     sum6 += r1[0] * ktmp[6];
9351                     sum7 += r1[0] * ktmp[7];
9352                     ktmp += 8;
9353 
9354                     sum0 += r1[1] * ktmp[0];
9355                     sum1 += r1[1] * ktmp[1];
9356                     sum2 += r1[1] * ktmp[2];
9357                     sum3 += r1[1] * ktmp[3];
9358                     sum4 += r1[1] * ktmp[4];
9359                     sum5 += r1[1] * ktmp[5];
9360                     sum6 += r1[1] * ktmp[6];
9361                     sum7 += r1[1] * ktmp[7];
9362                     ktmp += 8;
9363 
9364                     sum0 += r1[2] * ktmp[0];
9365                     sum1 += r1[2] * ktmp[1];
9366                     sum2 += r1[2] * ktmp[2];
9367                     sum3 += r1[2] * ktmp[3];
9368                     sum4 += r1[2] * ktmp[4];
9369                     sum5 += r1[2] * ktmp[5];
9370                     sum6 += r1[2] * ktmp[6];
9371                     sum7 += r1[2] * ktmp[7];
9372                     ktmp += 8;
9373 
9374                     sum0 += r2[0] * ktmp[0];
9375                     sum1 += r2[0] * ktmp[1];
9376                     sum2 += r2[0] * ktmp[2];
9377                     sum3 += r2[0] * ktmp[3];
9378                     sum4 += r2[0] * ktmp[4];
9379                     sum5 += r2[0] * ktmp[5];
9380                     sum6 += r2[0] * ktmp[6];
9381                     sum7 += r2[0] * ktmp[7];
9382                     ktmp += 8;
9383 
9384                     sum0 += r2[1] * ktmp[0];
9385                     sum1 += r2[1] * ktmp[1];
9386                     sum2 += r2[1] * ktmp[2];
9387                     sum3 += r2[1] * ktmp[3];
9388                     sum4 += r2[1] * ktmp[4];
9389                     sum5 += r2[1] * ktmp[5];
9390                     sum6 += r2[1] * ktmp[6];
9391                     sum7 += r2[1] * ktmp[7];
9392                     ktmp += 8;
9393 
9394                     sum0 += r2[2] * ktmp[0];
9395                     sum1 += r2[2] * ktmp[1];
9396                     sum2 += r2[2] * ktmp[2];
9397                     sum3 += r2[2] * ktmp[3];
9398                     sum4 += r2[2] * ktmp[4];
9399                     sum5 += r2[2] * ktmp[5];
9400                     sum6 += r2[2] * ktmp[6];
9401                     sum7 += r2[2] * ktmp[7];
9402                     ktmp += 8;
9403 
9404                     *outptr0 += sum0;
9405                     *outptr1 += sum1;
9406                     *outptr2 += sum2;
9407                     *outptr3 += sum3;
9408                     *outptr4 += sum4;
9409                     *outptr5 += sum5;
9410                     *outptr6 += sum6;
9411                     *outptr7 += sum7;
9412 
9413                     ktmp -= 8 * 9;
9414 
9415                     outptr0++;
9416                     outptr1++;
9417                     outptr2++;
9418                     outptr3++;
9419                     outptr4++;
9420                     outptr5++;
9421                     outptr6++;
9422                     outptr7++;
9423 #endif // __ARM_NEON
9424                     r0 += 2;
9425                     r1 += 2;
9426                     r2 += 2;
9427                 }
9428 
9429                 r0 += tailstep;
9430                 r1 += tailstep;
9431                 r2 += tailstep;
9432             }
9433 
9434             ktmp += 8 * 9;
9435         }
9436     }
9437 
9438     #pragma omp parallel for num_threads(opt.num_threads)
9439     for (int p = remain_outch_start; p < outch; p++)
9440     {
9441         Mat out = top_blob.channel(p);
9442 
9443         const float bias0 = bias ? bias[p] : 0.f;
9444 
9445         out.fill(bias0);
9446 
9447         const float* ktmp = _kernel.channel(p / 8 + p % 8);
9448 
9449         for (int q = 0; q < inch; q++)
9450         {
9451             float* outptr = out;
9452 
9453             const float* img0 = bottom_blob.channel(q);
9454 
9455             const float* r0 = img0;
9456             const float* r1 = img0 + w;
9457             const float* r2 = img0 + w * 2;
9458 
9459             const float* k0 = ktmp;
9460             const float* k1 = ktmp + 3;
9461             const float* k2 = ktmp + 6;
9462 
9463 #if __ARM_NEON
9464             float32x4_t _k0123 = vld1q_f32(k0);
9465             float32x4_t _k3456 = vld1q_f32(k1);
9466             float32x4_t _k6789 = vld1q_f32(k2);
9467 #endif // __ARM_NEON
9468 
9469             int i = 0;
9470 
9471             for (; i < outh; i++)
9472             {
9473 #if __ARM_NEON
9474                 int nn = outw >> 2;
9475                 int remain = outw & 3;
9476 #else
9477                 int remain = outw;
9478 #endif // __ARM_NEON
9479 
9480 #if __ARM_NEON
9481 #if __aarch64__
9482                 if (nn > 0)
9483                 {
9484                     asm volatile(
9485                         "prfm       pldl1keep, [%2, #256]          \n"
9486                         "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
9487                         "0:                                        \n"
9488 
9489                         "prfm       pldl1keep, [%1, #128]          \n"
9490                         "ld1        {v0.4s}, [%1]                  \n"
9491 
9492                         "fmla       v0.4s,  v2.4s, %10.s[0]        \n"
9493                         "fmul       v10.4s, v3.4s, %10.s[1]        \n"
9494 
9495                         "prfm       pldl1keep, [%2, #256]          \n"
9496                         "ld2        {v8.4s, v9.4s}, [%2]           \n"
9497                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
9498 
9499                         "fmul       v11.4s, v1.4s, %10.s[2]        \n"
9500 
9501                         "prfm       pldl1keep, [%3, #256]          \n"
9502                         "ld2        {v2.4s, v3.4s}, [%3], #32      \n"
9503 
9504                         "fmla       v0.4s,  v2.4s, %11.s[0]        \n"
9505                         "fmla       v10.4s, v3.4s, %11.s[1]        \n"
9506 
9507                         "prfm       pldl1keep, [%3, #256]          \n"
9508                         "ld2        {v8.4s, v9.4s}, [%3]           \n"
9509                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
9510 
9511                         "fmla       v11.4s, v1.4s, %11.s[2]        \n"
9512 
9513                         "prfm       pldl1keep, [%4, #256]          \n"
9514                         "ld2        {v2.4s, v3.4s}, [%4], #32      \n"
9515 
9516                         "fmla       v0.4s,  v2.4s, %12.s[0]        \n"
9517                         "fmla       v10.4s, v3.4s, %12.s[1]        \n"
9518 
9519                         "prfm       pldl1keep, [%4, #256]          \n"
9520                         "ld2        {v8.4s, v9.4s}, [%4]           \n"
9521                         "ext        v1.16b, v2.16b, v8.16b, #4     \n"
9522 
9523                         "fmla       v11.4s, v1.4s, %12.s[2]        \n"
9524 
9525                         "prfm       pldl1keep, [%2, #256]          \n"
9526                         "ld2        {v2.4s, v3.4s}, [%2], #32      \n"
9527 
9528                         "fadd       v0.4s, v0.4s, v10.4s           \n"
9529                         "fadd       v0.4s, v0.4s, v11.4s           \n"
9530 
9531                         "subs       %w0, %w0, #1                   \n"
9532                         "st1        {v0.4s}, [%1], #16             \n"
9533                         "bne        0b                             \n"
9534                         "sub        %2, %2, #32                    \n"
9535                         : "=r"(nn),     // %0
9536                         "=r"(outptr), // %1
9537                         "=r"(r0),     // %2
9538                         "=r"(r1),     // %3
9539                         "=r"(r2)      // %4
9540                         : "0"(nn),
9541                         "1"(outptr),
9542                         "2"(r0),
9543                         "3"(r1),
9544                         "4"(r2),
9545                         "w"(_k0123), // %10
9546                         "w"(_k3456), // %11
9547                         "w"(_k6789)  // %12
9548                         : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
9549                 }
9550 #else
9551                 if (nn > 0)
9552                 {
9553                     asm volatile(
9554                         "pld        [%2, #256]          \n"
9555                         "vld2.f32   {d4-d7}, [%2]!      \n"
9556 
9557                         "0:                             \n"
9558                         "pld        [%1, #128]          \n"
9559                         "vld1.f32   {d0-d1}, [%1]       \n"
9560 
9561                         "vmla.f32   q0, q2, %e10[0]     \n"
9562                         "vmul.f32   q10, q3, %e10[1]    \n"
9563 
9564                         "pld        [%2, #128]          \n"
9565                         "vld2.f32   {d16-d17}, [%2]     \n"
9566                         "vext.32    q1, q2, q8, #1      \n"
9567 
9568                         "vmul.f32   q11, q1, %f10[0]    \n"
9569 
9570                         "pld        [%3, #256]          \n"
9571                         "vld2.f32   {d4-d7}, [%3]!      \n"
9572 
9573                         "vmla.f32   q0, q2, %e11[0]     \n"
9574                         "vmla.f32   q10, q3, %e11[1]    \n"
9575 
9576                         "pld        [%3, #128]          \n"
9577                         "vld2.f32   {d16-d17}, [%3]     \n"
9578                         "vext.32    q1, q2, q8, #1      \n"
9579 
9580                         "vmla.f32   q11, q1, %f11[0]    \n"
9581 
9582                         "pld        [%4, #256]          \n"
9583                         "vld2.f32   {d4-d7}, [%4]!      \n"
9584 
9585                         "vmla.f32   q0, q2, %e12[0]     \n"
9586                         "vmla.f32   q10, q3, %e12[1]    \n"
9587 
9588                         "pld        [%4, #128]          \n"
9589                         "vld2.f32   {d16-d17}, [%4]     \n"
9590                         "vext.32    q1, q2, q8, #1      \n"
9591 
9592                         "vmla.f32   q11, q1, %f12[0]    \n"
9593 
9594                         "pld        [%2, #256]          \n"
9595                         "vld2.f32   {d4-d7}, [%2]!      \n"
9596 
9597                         "vadd.f32   q0, q0, q10         \n"
9598                         "vadd.f32   q0, q0, q11         \n"
9599 
9600                         "subs       %0, #1              \n"
9601                         "vst1.f32   {d0-d1}, [%1]!      \n"
9602                         "bne        0b                  \n"
9603                         "sub        %2, #32             \n"
9604                         : "=r"(nn),     // %0
9605                         "=r"(outptr), // %1
9606                         "=r"(r0),     // %2
9607                         "=r"(r1),     // %3
9608                         "=r"(r2)      // %4
9609                         : "0"(nn),
9610                         "1"(outptr),
9611                         "2"(r0),
9612                         "3"(r1),
9613                         "4"(r2),
9614                         "w"(_k0123), // %10
9615                         "w"(_k3456), // %11
9616                         "w"(_k6789)  // %12
9617                         : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9618                 }
9619 #endif // __aarch64__
9620 #endif // __ARM_NEON
9621                 for (; remain > 0; remain--)
9622                 {
9623 #if __ARM_NEON
9624                     float32x4_t _r00 = vld1q_f32(r0);
9625                     float32x4_t _r10 = vld1q_f32(r1);
9626                     float32x4_t _r20 = vld1q_f32(r2);
9627 
9628                     float32x4_t _sum = vmulq_f32(_r00, _k0123);
9629                     _sum = vmlaq_f32(_sum, _r10, _k3456);
9630                     _sum = vmlaq_f32(_sum, _r20, _k6789);
9631 
9632                     _sum = vsetq_lane_f32(*outptr, _sum, 3);
9633 
9634 #if __aarch64__
9635                     *outptr = vaddvq_f32(_sum);
9636 #else
9637                     float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
9638                     _ss = vpadd_f32(_ss, _ss);
9639 
9640                     *outptr = vget_lane_f32(_ss, 0);
9641 #endif // __aarch64__
9642 #else
9643                     float sum = 0;
9644 
9645                     sum += r0[0] * ktmp[0];
9646                     sum += r0[1] * ktmp[1];
9647                     sum += r0[2] * ktmp[2];
9648                     sum += r1[0] * ktmp[3];
9649                     sum += r1[1] * ktmp[4];
9650                     sum += r1[2] * ktmp[5];
9651                     sum += r2[0] * ktmp[6];
9652                     sum += r2[1] * ktmp[7];
9653                     sum += r2[2] * ktmp[8];
9654 
9655                     *outptr += sum;
9656 #endif // __ARM_NEON
9657 
9658                     r0 += 2;
9659                     r1 += 2;
9660                     r2 += 2;
9661                     outptr++;
9662                 }
9663 
9664                 r0 += tailstep;
9665                 r1 += tailstep;
9666                 r2 += tailstep;
9667             }
9668 
9669             ktmp += 9;
9670         }
9671     }
9672 }
9673