1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
conv1x1s1_sgemm_transform_kernel_pack4_neon(const Mat & kernel,Mat & kernel_tm_pack4,int inch,int outch)15 static void conv1x1s1_sgemm_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
16 {
17     // interleave
18     // src = inch-outch
19     // dst = 4b-4a-inch/4a-outch/4b
20 #if __aarch64__
21     kernel_tm_pack4.create(2 * 1, inch / 4, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
22 #else
23     kernel_tm_pack4.create(1, inch / 4, outch / 4, (size_t)4u * 16, 16);
24 #endif
25 
26     int q = 0;
27 #if __aarch64__
28     for (; q + 7 < outch; q += 8)
29     {
30         const float* k0 = (const float*)kernel + (q + 0) * inch;
31         const float* k1 = (const float*)kernel + (q + 1) * inch;
32         const float* k2 = (const float*)kernel + (q + 2) * inch;
33         const float* k3 = (const float*)kernel + (q + 3) * inch;
34         const float* k4 = (const float*)kernel + (q + 4) * inch;
35         const float* k5 = (const float*)kernel + (q + 5) * inch;
36         const float* k6 = (const float*)kernel + (q + 6) * inch;
37         const float* k7 = (const float*)kernel + (q + 7) * inch;
38 
39         float* g0 = kernel_tm_pack4.channel(q / 8);
40 
41         for (int p = 0; p + 3 < inch; p += 4)
42         {
43             g0[0] = k0[0];
44             g0[1] = k1[0];
45             g0[2] = k2[0];
46             g0[3] = k3[0];
47 
48             g0[4] = k4[0];
49             g0[5] = k5[0];
50             g0[6] = k6[0];
51             g0[7] = k7[0];
52 
53             g0[8] = k0[1];
54             g0[9] = k1[1];
55             g0[10] = k2[1];
56             g0[11] = k3[1];
57 
58             g0[12] = k4[1];
59             g0[13] = k5[1];
60             g0[14] = k6[1];
61             g0[15] = k7[1];
62 
63             g0[16] = k0[2];
64             g0[17] = k1[2];
65             g0[18] = k2[2];
66             g0[19] = k3[2];
67 
68             g0[20] = k4[2];
69             g0[21] = k5[2];
70             g0[22] = k6[2];
71             g0[23] = k7[2];
72 
73             g0[24] = k0[3];
74             g0[25] = k1[3];
75             g0[26] = k2[3];
76             g0[27] = k3[3];
77 
78             g0[28] = k4[3];
79             g0[29] = k5[3];
80             g0[30] = k6[3];
81             g0[31] = k7[3];
82 
83             k0 += 4;
84             k1 += 4;
85             k2 += 4;
86             k3 += 4;
87             k4 += 4;
88             k5 += 4;
89             k6 += 4;
90             k7 += 4;
91             g0 += 32;
92         }
93     }
94 #endif // __aarch64__
95     for (; q + 3 < outch; q += 4)
96     {
97         const float* k0 = (const float*)kernel + (q + 0) * inch;
98         const float* k1 = (const float*)kernel + (q + 1) * inch;
99         const float* k2 = (const float*)kernel + (q + 2) * inch;
100         const float* k3 = (const float*)kernel + (q + 3) * inch;
101 
102 #if __aarch64__
103         float* g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
104 #else
105         float* g0 = kernel_tm_pack4.channel(q / 4);
106 #endif
107 
108         for (int p = 0; p + 3 < inch; p += 4)
109         {
110             g0[0] = k0[0];
111             g0[1] = k1[0];
112             g0[2] = k2[0];
113             g0[3] = k3[0];
114 
115             g0[4] = k0[1];
116             g0[5] = k1[1];
117             g0[6] = k2[1];
118             g0[7] = k3[1];
119 
120             g0[8] = k0[2];
121             g0[9] = k1[2];
122             g0[10] = k2[2];
123             g0[11] = k3[2];
124 
125             g0[12] = k0[3];
126             g0[13] = k1[3];
127             g0[14] = k2[3];
128             g0[15] = k3[3];
129 
130             k0 += 4;
131             k1 += 4;
132             k2 += 4;
133             k3 += 4;
134             g0 += 16;
135         }
136     }
137 }
138 
conv1x1s1_sgemm_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)139 static void conv1x1s1_sgemm_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
140 {
141     int w = bottom_blob.w;
142     int h = bottom_blob.h;
143     int inch = bottom_blob.c;
144     int outch = top_blob.c;
145 
146     size_t elemsize = bottom_blob.elemsize;
147     int elempack = bottom_blob.elempack;
148 
149     const int size = w * h;
150 
151     const float* bias = _bias;
152 
153     // interleave
154 #if __aarch64__
155     Mat tmp(12, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, elemsize, elempack, opt.workspace_allocator);
156 #else
157     Mat tmp(8, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, elemsize, elempack, opt.workspace_allocator);
158 #endif
159     {
160         int nn_size;
161         int remain_size_start;
162 
163 #if __aarch64__
164         nn_size = size / 12;
165         remain_size_start = nn_size * 12;
166 
167         #pragma omp parallel for num_threads(opt.num_threads)
168         for (int ii = 0; ii < nn_size; ii++)
169         {
170             int i = ii * 12;
171 
172             const float* img0 = bottom_blob.channel(0);
173             img0 += i * 4;
174 
175             float* tmpptr = tmp.channel(i / 12);
176 
177             for (int q = 0; q < inch; q++)
178             {
179                 asm volatile(
180                     "prfm   pldl1keep, [%0, #512]       \n"
181                     "ld4    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
182                     "prfm   pldl1keep, [%0, #512]       \n"
183                     "ld4    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
184                     "prfm   pldl1keep, [%0, #512]       \n"
185                     "ld4    {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
186                     "st1    {v0.4s}, [%1], #16          \n"
187                     "st1    {v4.4s}, [%1], #16          \n"
188                     "st1    {v8.4s}, [%1], #16          \n"
189                     "sub    %0, %0, #128                \n"
190                     "st1    {v1.4s}, [%1], #16          \n"
191                     "st1    {v5.4s}, [%1], #16          \n"
192                     "st1    {v9.4s}, [%1], #16          \n"
193                     "st1    {v2.4s}, [%1], #16          \n"
194                     "st1    {v6.4s}, [%1], #16          \n"
195                     "st1    {v10.4s}, [%1], #16         \n"
196                     "st1    {v3.4s}, [%1], #16          \n"
197                     "st1    {v7.4s}, [%1], #16          \n"
198                     "st1    {v11.4s}, [%1], #16         \n"
199                     : "=r"(img0),  // %0
200                     "=r"(tmpptr) // %1
201                     : "0"(img0),
202                     "1"(tmpptr)
203                     : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
204                 img0 += bottom_blob.cstep * 4;
205             }
206         }
207 #else
208         remain_size_start = 0;
209 #endif
210         nn_size = (size - remain_size_start) >> 3;
211 
212         #pragma omp parallel for num_threads(opt.num_threads)
213         for (int ii = 0; ii < nn_size; ii++)
214         {
215             int i = remain_size_start + ii * 8;
216 
217             const float* img0 = bottom_blob.channel(0);
218             img0 += i * 4;
219 
220 #if __aarch64__
221             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
222 #else
223             float* tmpptr = tmp.channel(i / 8);
224 #endif
225 
226             for (int q = 0; q < inch; q++)
227             {
228 #if __aarch64__
229                 asm volatile(
230                     "prfm   pldl1keep, [%0, #512]       \n"
231                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
232                     "prfm   pldl1keep, [%0, #512]       \n"
233                     "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
234                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
235                     "sub    %0, %0, #64                 \n"
236                     "st1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
237                     : "=r"(img0),  // %0
238                     "=r"(tmpptr) // %1
239                     : "0"(img0),
240                     "1"(tmpptr)
241                     : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
242 #else
243                 asm volatile(
244                     "pld        [%0, #512]          \n"
245                     "vldm       %0!, {d0-d7}        \n"
246                     "pld        [%0, #512]          \n"
247                     "vldm       %0, {d16-d23}       \n"
248 
249                     // transpose 8x4
250                     "vtrn.32    q0, q1              \n"
251                     "vtrn.32    q2, q3              \n"
252                     "vtrn.32    q8, q9              \n"
253                     "vtrn.32    q10, q11            \n"
254                     "vswp       d1, d4              \n"
255                     "vswp       d3, d6              \n"
256                     "vswp       d17, d20            \n"
257                     "vswp       d19, d22            \n"
258                     "vswp       q1, q8              \n"
259                     "vswp       q3, q10             \n"
260 
261                     "vst1.f32   {d0-d3}, [%1 :128]! \n"
262                     "vst1.f32   {d16-d19}, [%1 :128]! \n"
263                     "sub        %0, %0, #64         \n"
264                     "vst1.f32   {d4-d7}, [%1 :128]! \n"
265                     "vst1.f32   {d20-d23}, [%1 :128]! \n"
266                     : "=r"(img0),  // %0
267                     "=r"(tmpptr) // %1
268                     : "0"(img0),
269                     "1"(tmpptr)
270                     : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
271 #endif // __aarch64__
272                 img0 += bottom_blob.cstep * 4;
273             }
274         }
275 
276         remain_size_start += nn_size << 3;
277         nn_size = (size - remain_size_start) >> 2;
278 
279         #pragma omp parallel for num_threads(opt.num_threads)
280         for (int ii = 0; ii < nn_size; ii++)
281         {
282             int i = remain_size_start + ii * 4;
283 
284             const float* img0 = bottom_blob.channel(0);
285             img0 += i * 4;
286 
287 #if __aarch64__
288             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
289 #else
290             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
291 #endif
292 
293             for (int q = 0; q < inch; q++)
294             {
295 #if __aarch64__
296                 asm volatile(
297                     "prfm   pldl1keep, [%0, #512]       \n"
298                     "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
299                     "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
300                     : "=r"(img0),  // %0
301                     "=r"(tmpptr) // %1
302                     : "0"(img0),
303                     "1"(tmpptr)
304                     : "memory", "v0", "v1", "v2", "v3");
305 #else
306                 asm volatile(
307                     "pld        [%0, #512]          \n"
308                     "vldm       %0, {d0-d7}         \n"
309                     "vstm       %1!, {d0-d7}        \n"
310                     : "=r"(img0),  // %0
311                     "=r"(tmpptr) // %1
312                     : "0"(img0),
313                     "1"(tmpptr)
314                     : "memory", "q0", "q1", "q2", "q3");
315 #endif // __aarch64__
316                 img0 += bottom_blob.cstep * 4;
317             }
318         }
319 
320         remain_size_start += nn_size << 2;
321         nn_size = (size - remain_size_start) >> 1;
322 
323         #pragma omp parallel for num_threads(opt.num_threads)
324         for (int ii = 0; ii < nn_size; ii++)
325         {
326             int i = remain_size_start + ii * 2;
327 
328             const float* img0 = bottom_blob.channel(0);
329             img0 += i * 4;
330 
331 #if __aarch64__
332             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
333 #else
334             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
335 #endif
336 
337             for (int q = 0; q < inch; q++)
338             {
339 #if __aarch64__
340                 asm volatile(
341                     "prfm   pldl1keep, [%0, #256]       \n"
342                     "ld1    {v0.4s, v1.4s}, [%0]        \n"
343                     "st1    {v0.4s, v1.4s}, [%1], #32   \n"
344                     : "=r"(img0),  // %0
345                     "=r"(tmpptr) // %1
346                     : "0"(img0),
347                     "1"(tmpptr)
348                     : "memory", "v0", "v1");
349 #else
350                 asm volatile(
351                     "pld        [%0, #256]          \n"
352                     "vld1.f32   {d0-d3}, [%0 :128]  \n"
353                     "vst1.f32   {d0-d3}, [%1 :128]! \n"
354                     : "=r"(img0),  // %0
355                     "=r"(tmpptr) // %1
356                     : "0"(img0),
357                     "1"(tmpptr)
358                     : "memory", "q0", "q1");
359 #endif // __aarch64__
360                 img0 += bottom_blob.cstep * 4;
361             }
362         }
363 
364         remain_size_start += nn_size << 1;
365 
366         #pragma omp parallel for num_threads(opt.num_threads)
367         for (int i = remain_size_start; i < size; i++)
368         {
369             const float* img0 = bottom_blob.channel(0);
370             img0 += i * 4;
371 
372 #if __aarch64__
373             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
374 #else
375             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
376 #endif
377 
378             for (int q = 0; q < inch; q++)
379             {
380 #if __aarch64__
381                 asm volatile(
382                     "prfm   pldl1keep, [%0, #128]       \n"
383                     "ld1    {v0.4s}, [%0]               \n"
384                     "st1    {v0.4s}, [%1], #16          \n"
385                     : "=r"(img0),  // %0
386                     "=r"(tmpptr) // %1
387                     : "0"(img0),
388                     "1"(tmpptr)
389                     : "memory", "v0");
390 #else
391                 asm volatile(
392                     "pld        [%0, #128]          \n"
393                     "vld1.f32   {d0-d1}, [%0 :128]  \n"
394                     "vst1.f32   {d0-d1}, [%1 :128]! \n"
395                     : "=r"(img0),  // %0
396                     "=r"(tmpptr) // %1
397                     : "0"(img0),
398                     "1"(tmpptr)
399                     : "memory", "q0");
400 #endif // __aarch64__
401                 img0 += bottom_blob.cstep * 4;
402             }
403         }
404     }
405 
406     int remain_outch_start = 0;
407 
408 #if __ARM_NEON && __aarch64__
409     int nn_outch = 0;
410     nn_outch = outch >> 1;
411     remain_outch_start = nn_outch << 1;
412 
413     #pragma omp parallel for num_threads(opt.num_threads)
414     for (int pp = 0; pp < nn_outch; pp++)
415     {
416         int p = pp * 2;
417 
418         float* outptr0 = top_blob.channel(p);
419         float* outptr1 = top_blob.channel(p + 1);
420 
421         const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
422         const float* biasptr = bias ? bias + p * 4 : zeros;
423 
424         int i = 0;
425         for (; i + 11 < size; i += 12)
426         {
427             const float* tmpptr = tmp.channel(i / 12);
428 
429             const float* kptr01 = (const float*)kernel.channel(pp);
430 
431             int nn = inch; // inch always > 0
432 
433             asm volatile(
434                 "ld1    {v0.4s, v1.4s}, [%10]       \n"
435                 "mov    v8.16b, v0.16b              \n"
436                 "mov    v9.16b, v0.16b              \n"
437                 "mov    v10.16b, v0.16b             \n"
438                 "mov    v11.16b, v0.16b             \n"
439                 "mov    v12.16b, v0.16b             \n"
440                 "mov    v13.16b, v0.16b             \n"
441                 "mov    v14.16b, v0.16b             \n"
442                 "mov    v15.16b, v0.16b             \n"
443                 "mov    v16.16b, v0.16b             \n"
444                 "mov    v17.16b, v0.16b             \n"
445                 "mov    v18.16b, v0.16b             \n"
446                 "mov    v19.16b, v0.16b             \n"
447                 "mov    v20.16b, v1.16b             \n"
448                 "mov    v21.16b, v1.16b             \n"
449                 "mov    v22.16b, v1.16b             \n"
450                 "mov    v23.16b, v1.16b             \n"
451                 "mov    v24.16b, v1.16b             \n"
452                 "mov    v25.16b, v1.16b             \n"
453                 "mov    v26.16b, v1.16b             \n"
454                 "mov    v27.16b, v1.16b             \n"
455                 "mov    v28.16b, v1.16b             \n"
456                 "mov    v29.16b, v1.16b             \n"
457                 "mov    v30.16b, v1.16b             \n"
458                 "mov    v31.16b, v1.16b             \n"
459 
460                 "0:                                 \n"
461 
462                 "prfm   pldl1keep, [%3, #512]       \n"
463                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
464 
465                 "prfm   pldl1keep, [%4, #512]       \n"
466                 "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w0011_01
467 
468                 "fmla   v8.4s, v4.4s, v0.s[0]       \n"
469                 "fmla   v9.4s, v4.4s, v0.s[1]       \n"
470                 "fmla   v10.4s, v4.4s, v0.s[2]      \n"
471                 "fmla   v11.4s, v4.4s, v0.s[3]      \n"
472                 "fmla   v12.4s, v4.4s, v1.s[0]      \n"
473                 "fmla   v13.4s, v4.4s, v1.s[1]      \n"
474                 "fmla   v14.4s, v4.4s, v1.s[2]      \n"
475                 "fmla   v15.4s, v4.4s, v1.s[3]      \n"
476                 "fmla   v16.4s, v4.4s, v2.s[0]      \n"
477                 "fmla   v17.4s, v4.4s, v2.s[1]      \n"
478                 "fmla   v18.4s, v4.4s, v2.s[2]      \n"
479                 "fmla   v19.4s, v4.4s, v2.s[3]      \n"
480 
481                 "fmla   v20.4s, v5.4s, v0.s[0]      \n"
482                 "fmla   v21.4s, v5.4s, v0.s[1]      \n"
483                 "fmla   v22.4s, v5.4s, v0.s[2]      \n"
484                 "fmla   v23.4s, v5.4s, v0.s[3]      \n"
485                 "fmla   v24.4s, v5.4s, v1.s[0]      \n"
486                 "fmla   v25.4s, v5.4s, v1.s[1]      \n"
487                 "fmla   v26.4s, v5.4s, v1.s[2]      \n"
488                 "fmla   v27.4s, v5.4s, v1.s[3]      \n"
489                 "fmla   v28.4s, v5.4s, v2.s[0]      \n"
490                 "fmla   v29.4s, v5.4s, v2.s[1]      \n"
491                 "fmla   v30.4s, v5.4s, v2.s[2]      \n"
492                 "fmla   v31.4s, v5.4s, v2.s[3]      \n"
493 
494                 "fmla   v8.4s, v6.4s, v3.s[0]       \n"
495                 "fmla   v9.4s, v6.4s, v3.s[1]       \n"
496                 "fmla   v10.4s, v6.4s, v3.s[2]      \n"
497                 "fmla   v11.4s, v6.4s, v3.s[3]      \n"
498 
499                 "fmla   v20.4s, v7.4s, v3.s[0]      \n"
500                 "fmla   v21.4s, v7.4s, v3.s[1]      \n"
501                 "fmla   v22.4s, v7.4s, v3.s[2]      \n"
502                 "fmla   v23.4s, v7.4s, v3.s[3]      \n"
503 
504                 "prfm   pldl1keep, [%3, #512]       \n"
505                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
506 
507                 "fmla   v12.4s, v6.4s, v0.s[0]      \n"
508                 "fmla   v13.4s, v6.4s, v0.s[1]      \n"
509                 "fmla   v14.4s, v6.4s, v0.s[2]      \n"
510                 "fmla   v15.4s, v6.4s, v0.s[3]      \n"
511                 "fmla   v16.4s, v6.4s, v1.s[0]      \n"
512                 "fmla   v17.4s, v6.4s, v1.s[1]      \n"
513                 "fmla   v18.4s, v6.4s, v1.s[2]      \n"
514                 "fmla   v19.4s, v6.4s, v1.s[3]      \n"
515 
516                 "fmla   v24.4s, v7.4s, v0.s[0]      \n"
517                 "fmla   v25.4s, v7.4s, v0.s[1]      \n"
518                 "fmla   v26.4s, v7.4s, v0.s[2]      \n"
519                 "fmla   v27.4s, v7.4s, v0.s[3]      \n"
520                 "fmla   v28.4s, v7.4s, v1.s[0]      \n"
521                 "fmla   v29.4s, v7.4s, v1.s[1]      \n"
522                 "fmla   v30.4s, v7.4s, v1.s[2]      \n"
523                 "fmla   v31.4s, v7.4s, v1.s[3]      \n"
524 
525                 "prfm   pldl1keep, [%4, #512]       \n"
526                 "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w2233_01
527 
528                 "fmla   v8.4s, v4.4s, v2.s[0]       \n"
529                 "fmla   v9.4s, v4.4s, v2.s[1]       \n"
530                 "fmla   v10.4s, v4.4s, v2.s[2]      \n"
531                 "fmla   v11.4s, v4.4s, v2.s[3]      \n"
532                 "fmla   v12.4s, v4.4s, v3.s[0]      \n"
533                 "fmla   v13.4s, v4.4s, v3.s[1]      \n"
534                 "fmla   v14.4s, v4.4s, v3.s[2]      \n"
535                 "fmla   v15.4s, v4.4s, v3.s[3]      \n"
536 
537                 "fmla   v20.4s, v5.4s, v2.s[0]      \n"
538                 "fmla   v21.4s, v5.4s, v2.s[1]      \n"
539                 "fmla   v22.4s, v5.4s, v2.s[2]      \n"
540                 "fmla   v23.4s, v5.4s, v2.s[3]      \n"
541                 "fmla   v24.4s, v5.4s, v3.s[0]      \n"
542                 "fmla   v25.4s, v5.4s, v3.s[1]      \n"
543                 "fmla   v26.4s, v5.4s, v3.s[2]      \n"
544                 "fmla   v27.4s, v5.4s, v3.s[3]      \n"
545 
546                 "prfm   pldl1keep, [%3, #512]       \n"
547                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
548 
549                 "fmla   v16.4s, v4.4s, v0.s[0]      \n"
550                 "fmla   v17.4s, v4.4s, v0.s[1]      \n"
551                 "fmla   v18.4s, v4.4s, v0.s[2]      \n"
552                 "fmla   v19.4s, v4.4s, v0.s[3]      \n"
553 
554                 "fmla   v28.4s, v5.4s, v0.s[0]      \n"
555                 "fmla   v29.4s, v5.4s, v0.s[1]      \n"
556                 "fmla   v30.4s, v5.4s, v0.s[2]      \n"
557                 "fmla   v31.4s, v5.4s, v0.s[3]      \n"
558 
559                 "fmla   v8.4s, v6.4s, v1.s[0]       \n"
560                 "fmla   v9.4s, v6.4s, v1.s[1]       \n"
561                 "fmla   v10.4s, v6.4s, v1.s[2]      \n"
562                 "fmla   v11.4s, v6.4s, v1.s[3]      \n"
563                 "fmla   v12.4s, v6.4s, v2.s[0]      \n"
564                 "fmla   v13.4s, v6.4s, v2.s[1]      \n"
565                 "fmla   v14.4s, v6.4s, v2.s[2]      \n"
566                 "fmla   v15.4s, v6.4s, v2.s[3]      \n"
567                 "fmla   v16.4s, v6.4s, v3.s[0]      \n"
568                 "fmla   v17.4s, v6.4s, v3.s[1]      \n"
569                 "fmla   v18.4s, v6.4s, v3.s[2]      \n"
570                 "fmla   v19.4s, v6.4s, v3.s[3]      \n"
571 
572                 "subs   %w0, %w0, #1                \n"
573 
574                 "fmla   v20.4s, v7.4s, v1.s[0]      \n"
575                 "fmla   v21.4s, v7.4s, v1.s[1]      \n"
576                 "fmla   v22.4s, v7.4s, v1.s[2]      \n"
577                 "fmla   v23.4s, v7.4s, v1.s[3]      \n"
578                 "fmla   v24.4s, v7.4s, v2.s[0]      \n"
579                 "fmla   v25.4s, v7.4s, v2.s[1]      \n"
580                 "fmla   v26.4s, v7.4s, v2.s[2]      \n"
581                 "fmla   v27.4s, v7.4s, v2.s[3]      \n"
582                 "fmla   v28.4s, v7.4s, v3.s[0]      \n"
583                 "fmla   v29.4s, v7.4s, v3.s[1]      \n"
584                 "fmla   v30.4s, v7.4s, v3.s[2]      \n"
585                 "fmla   v31.4s, v7.4s, v3.s[3]      \n"
586 
587                 "bne    0b                          \n"
588 
589                 "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
590                 "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
591                 "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
592                 "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
593                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
594                 "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
595 
596                 : "=r"(nn),      // %0
597                 "=r"(outptr0), // %1
598                 "=r"(outptr1), // %2
599                 "=r"(tmpptr),  // %3
600                 "=r"(kptr01)   // %4
601                 : "0"(nn),
602                 "1"(outptr0),
603                 "2"(outptr1),
604                 "3"(tmpptr),
605                 "4"(kptr01),
606                 "r"(biasptr) // %10
607                 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
608         }
609         for (; i + 7 < size; i += 8)
610         {
611             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
612 
613             const float* kptr01 = (const float*)kernel.channel(pp);
614 
615             int nn = inch; // inch always > 0
616 
617             asm volatile(
618                 "ld1    {v0.4s, v1.4s}, [%10]       \n"
619                 "mov    v16.16b, v0.16b             \n"
620                 "mov    v17.16b, v0.16b             \n"
621                 "mov    v18.16b, v0.16b             \n"
622                 "mov    v19.16b, v0.16b             \n"
623                 "mov    v20.16b, v0.16b             \n"
624                 "mov    v21.16b, v0.16b             \n"
625                 "mov    v22.16b, v0.16b             \n"
626                 "mov    v23.16b, v0.16b             \n"
627                 "mov    v24.16b, v1.16b             \n"
628                 "mov    v25.16b, v1.16b             \n"
629                 "mov    v26.16b, v1.16b             \n"
630                 "mov    v27.16b, v1.16b             \n"
631                 "mov    v28.16b, v1.16b             \n"
632                 "mov    v29.16b, v1.16b             \n"
633                 "mov    v30.16b, v1.16b             \n"
634                 "mov    v31.16b, v1.16b             \n"
635 
636                 "0:                                 \n"
637 
638                 "prfm   pldl1keep, [%3, #512]       \n"
639                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
640 
641                 "prfm   pldl1keep, [%4, #512]       \n"
642                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
643 
644                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
645                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
646                 "fmla   v18.4s, v8.4s, v2.s[0]      \n"
647                 "fmla   v19.4s, v8.4s, v3.s[0]      \n"
648 
649                 "prfm   pldl1keep, [%3, #512]       \n"
650                 "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7
651 
652                 "fmla   v20.4s, v8.4s, v4.s[0]      \n"
653                 "fmla   v21.4s, v8.4s, v5.s[0]      \n"
654                 "fmla   v22.4s, v8.4s, v6.s[0]      \n"
655                 "fmla   v23.4s, v8.4s, v7.s[0]      \n"
656 
657                 "fmla   v24.4s, v9.4s, v0.s[0]      \n"
658                 "fmla   v25.4s, v9.4s, v1.s[0]      \n"
659                 "fmla   v26.4s, v9.4s, v2.s[0]      \n"
660                 "fmla   v27.4s, v9.4s, v3.s[0]      \n"
661                 "fmla   v28.4s, v9.4s, v4.s[0]      \n"
662                 "fmla   v29.4s, v9.4s, v5.s[0]      \n"
663                 "fmla   v30.4s, v9.4s, v6.s[0]      \n"
664                 "fmla   v31.4s, v9.4s, v7.s[0]      \n"
665 
666                 "fmla   v16.4s, v10.4s, v0.s[1]     \n"
667                 "fmla   v17.4s, v10.4s, v1.s[1]     \n"
668                 "fmla   v18.4s, v10.4s, v2.s[1]     \n"
669                 "fmla   v19.4s, v10.4s, v3.s[1]     \n"
670                 "fmla   v20.4s, v10.4s, v4.s[1]     \n"
671                 "fmla   v21.4s, v10.4s, v5.s[1]     \n"
672                 "fmla   v22.4s, v10.4s, v6.s[1]     \n"
673                 "fmla   v23.4s, v10.4s, v7.s[1]     \n"
674 
675                 "fmla   v24.4s, v11.4s, v0.s[1]     \n"
676                 "fmla   v25.4s, v11.4s, v1.s[1]     \n"
677                 "fmla   v26.4s, v11.4s, v2.s[1]     \n"
678                 "fmla   v27.4s, v11.4s, v3.s[1]     \n"
679                 "fmla   v28.4s, v11.4s, v4.s[1]     \n"
680                 "fmla   v29.4s, v11.4s, v5.s[1]     \n"
681                 "fmla   v30.4s, v11.4s, v6.s[1]     \n"
682                 "fmla   v31.4s, v11.4s, v7.s[1]     \n"
683 
684                 "prfm   pldl1keep, [%4, #512]       \n"
685                 "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
686 
687                 "fmla   v16.4s, v12.4s, v0.s[2]     \n"
688                 "fmla   v17.4s, v12.4s, v1.s[2]     \n"
689                 "fmla   v18.4s, v12.4s, v2.s[2]     \n"
690                 "fmla   v19.4s, v12.4s, v3.s[2]     \n"
691                 "fmla   v20.4s, v12.4s, v4.s[2]     \n"
692                 "fmla   v21.4s, v12.4s, v5.s[2]     \n"
693                 "fmla   v22.4s, v12.4s, v6.s[2]     \n"
694                 "fmla   v23.4s, v12.4s, v7.s[2]     \n"
695 
696                 "fmla   v24.4s, v13.4s, v0.s[2]     \n"
697                 "fmla   v25.4s, v13.4s, v1.s[2]     \n"
698                 "fmla   v26.4s, v13.4s, v2.s[2]     \n"
699                 "fmla   v27.4s, v13.4s, v3.s[2]     \n"
700                 "fmla   v28.4s, v13.4s, v4.s[2]     \n"
701                 "fmla   v29.4s, v13.4s, v5.s[2]     \n"
702                 "fmla   v30.4s, v13.4s, v6.s[2]     \n"
703                 "fmla   v31.4s, v13.4s, v7.s[2]     \n"
704 
705                 "fmla   v16.4s, v14.4s, v0.s[3]     \n"
706                 "fmla   v17.4s, v14.4s, v1.s[3]     \n"
707                 "fmla   v18.4s, v14.4s, v2.s[3]     \n"
708                 "fmla   v19.4s, v14.4s, v3.s[3]     \n"
709                 "fmla   v20.4s, v14.4s, v4.s[3]     \n"
710                 "fmla   v21.4s, v14.4s, v5.s[3]     \n"
711                 "fmla   v22.4s, v14.4s, v6.s[3]     \n"
712                 "fmla   v23.4s, v14.4s, v7.s[3]     \n"
713 
714                 "subs   %w0, %w0, #1                \n"
715 
716                 "fmla   v24.4s, v15.4s, v0.s[3]     \n"
717                 "fmla   v25.4s, v15.4s, v1.s[3]     \n"
718                 "fmla   v26.4s, v15.4s, v2.s[3]     \n"
719                 "fmla   v27.4s, v15.4s, v3.s[3]     \n"
720                 "fmla   v28.4s, v15.4s, v4.s[3]     \n"
721                 "fmla   v29.4s, v15.4s, v5.s[3]     \n"
722                 "fmla   v30.4s, v15.4s, v6.s[3]     \n"
723                 "fmla   v31.4s, v15.4s, v7.s[3]     \n"
724 
725                 "bne    0b                          \n"
726 
727                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
728                 "st1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
729                 "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
730                 "st1    {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
731 
732                 : "=r"(nn),      // %0
733                 "=r"(outptr0), // %1
734                 "=r"(outptr1), // %2
735                 "=r"(tmpptr),  // %3
736                 "=r"(kptr01)   // %4
737                 : "0"(nn),
738                 "1"(outptr0),
739                 "2"(outptr1),
740                 "3"(tmpptr),
741                 "4"(kptr01),
742                 "r"(biasptr) // %10
743                 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
744         }
745         for (; i + 3 < size; i += 4)
746         {
747             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
748 
749             const float* kptr01 = (const float*)kernel.channel(pp);
750 
751             int nn = inch; // inch always > 0
752 
753             asm volatile(
754                 "ld1    {v0.4s, v1.4s}, [%10]       \n"
755                 "mov    v16.16b, v0.16b             \n"
756                 "mov    v17.16b, v0.16b             \n"
757                 "mov    v18.16b, v0.16b             \n"
758                 "mov    v19.16b, v0.16b             \n"
759                 "mov    v20.16b, v1.16b             \n"
760                 "mov    v21.16b, v1.16b             \n"
761                 "mov    v22.16b, v1.16b             \n"
762                 "mov    v23.16b, v1.16b             \n"
763 
764                 "0:                                 \n"
765 
766                 "prfm   pldl1keep, [%3, #512]       \n"
767                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
768 
769                 "prfm   pldl1keep, [%4, #512]       \n"
770                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
771 
772                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
773                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
774                 "fmla   v18.4s, v8.4s, v2.s[0]      \n"
775                 "fmla   v19.4s, v8.4s, v3.s[0]      \n"
776 
777                 "fmla   v20.4s, v9.4s, v0.s[0]      \n"
778                 "fmla   v21.4s, v9.4s, v1.s[0]      \n"
779                 "fmla   v22.4s, v9.4s, v2.s[0]      \n"
780                 "fmla   v23.4s, v9.4s, v3.s[0]      \n"
781 
782                 "prfm   pldl1keep, [%4, #512]       \n"
783                 "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
784 
785                 "fmla   v16.4s, v10.4s, v0.s[1]     \n"
786                 "fmla   v17.4s, v10.4s, v1.s[1]     \n"
787                 "fmla   v18.4s, v10.4s, v2.s[1]     \n"
788                 "fmla   v19.4s, v10.4s, v3.s[1]     \n"
789 
790                 "fmla   v20.4s, v11.4s, v0.s[1]     \n"
791                 "fmla   v21.4s, v11.4s, v1.s[1]     \n"
792                 "fmla   v22.4s, v11.4s, v2.s[1]     \n"
793                 "fmla   v23.4s, v11.4s, v3.s[1]     \n"
794 
795                 "fmla   v16.4s, v12.4s, v0.s[2]     \n"
796                 "fmla   v17.4s, v12.4s, v1.s[2]     \n"
797                 "fmla   v18.4s, v12.4s, v2.s[2]     \n"
798                 "fmla   v19.4s, v12.4s, v3.s[2]     \n"
799 
800                 "fmla   v20.4s, v13.4s, v0.s[2]     \n"
801                 "fmla   v21.4s, v13.4s, v1.s[2]     \n"
802                 "fmla   v22.4s, v13.4s, v2.s[2]     \n"
803                 "fmla   v23.4s, v13.4s, v3.s[2]     \n"
804 
805                 "subs   %w0, %w0, #1                \n"
806 
807                 "fmla   v16.4s, v14.4s, v0.s[3]     \n"
808                 "fmla   v17.4s, v14.4s, v1.s[3]     \n"
809                 "fmla   v18.4s, v14.4s, v2.s[3]     \n"
810                 "fmla   v19.4s, v14.4s, v3.s[3]     \n"
811 
812                 "fmla   v20.4s, v15.4s, v0.s[3]     \n"
813                 "fmla   v21.4s, v15.4s, v1.s[3]     \n"
814                 "fmla   v22.4s, v15.4s, v2.s[3]     \n"
815                 "fmla   v23.4s, v15.4s, v3.s[3]     \n"
816 
817                 "bne    0b                          \n"
818 
819                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
820                 "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
821 
822                 : "=r"(nn),      // %0
823                 "=r"(outptr0), // %1
824                 "=r"(outptr1), // %2
825                 "=r"(tmpptr),  // %3
826                 "=r"(kptr01)   // %4
827                 : "0"(nn),
828                 "1"(outptr0),
829                 "2"(outptr1),
830                 "3"(tmpptr),
831                 "4"(kptr01),
832                 "r"(biasptr) // %10
833                 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
834         }
835         for (; i + 1 < size; i += 2)
836         {
837             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
838 
839             const float* kptr01 = (const float*)kernel.channel(pp);
840 
841             int nn = inch; // inch always > 0
842 
843             asm volatile(
844                 "ld1    {v0.4s, v1.4s}, [%10]       \n"
845                 "mov    v16.16b, v0.16b             \n"
846                 "mov    v17.16b, v0.16b             \n"
847                 "mov    v18.16b, v1.16b             \n"
848                 "mov    v19.16b, v1.16b             \n"
849 
850                 "0:                                 \n"
851 
852                 "prfm   pldl1keep, [%3, #256]       \n"
853                 "ld1    {v0.4s, v1.4s}, [%3], #32   \n" // r0 r1
854 
855                 "prfm   pldl1keep, [%4, #512]       \n"
856                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
857 
858                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
859                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
860                 "fmla   v18.4s, v9.4s, v0.s[0]     \n"
861                 "fmla   v19.4s, v9.4s, v1.s[0]     \n"
862 
863                 "prfm   pldl1keep, [%4, #512]       \n"
864                 "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
865 
866                 "fmla   v16.4s, v10.4s, v0.s[1]      \n"
867                 "fmla   v17.4s, v10.4s, v1.s[1]      \n"
868                 "fmla   v18.4s, v11.4s, v0.s[1]     \n"
869                 "fmla   v19.4s, v11.4s, v1.s[1]     \n"
870 
871                 "fmla   v16.4s, v12.4s, v0.s[2]     \n"
872                 "fmla   v17.4s, v12.4s, v1.s[2]     \n"
873                 "fmla   v18.4s, v13.4s, v0.s[2]     \n"
874                 "fmla   v19.4s, v13.4s, v1.s[2]     \n"
875 
876                 "subs   %w0, %w0, #1                \n"
877 
878                 "fmla   v16.4s, v14.4s, v0.s[3]     \n"
879                 "fmla   v17.4s, v14.4s, v1.s[3]     \n"
880                 "fmla   v18.4s, v15.4s, v0.s[3]     \n"
881                 "fmla   v19.4s, v15.4s, v1.s[3]     \n"
882 
883                 "bne    0b                          \n"
884 
885                 "st1    {v16.4s, v17.4s}, [%1], #32 \n"
886                 "st1    {v18.4s, v19.4s}, [%2], #32 \n"
887 
888                 : "=r"(nn),      // %0
889                 "=r"(outptr0), // %1
890                 "=r"(outptr1), // %2
891                 "=r"(tmpptr),  // %3
892                 "=r"(kptr01)   // %4
893                 : "0"(nn),
894                 "1"(outptr0),
895                 "2"(outptr1),
896                 "3"(tmpptr),
897                 "4"(kptr01),
898                 "r"(biasptr) // %10
899                 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
900         }
901         for (; i < size; i++)
902         {
903             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
904 
905             const float* kptr01 = (const float*)kernel.channel(pp);
906 
907             int nn = inch; // inch always > 0
908 
909             asm volatile(
910                 "ld1    {v16.4s, v17.4s}, [%10]     \n"
911 
912                 "0:                                 \n"
913 
914                 "prfm   pldl1keep, [%3, #128]       \n"
915                 "ld1    {v0.4s}, [%3], #16          \n" // r0
916 
917                 "prfm   pldl1keep, [%4, #512]       \n"
918                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
919 
920                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
921                 "fmla   v17.4s, v9.4s, v0.s[0]      \n"
922 
923                 "prfm   pldl1keep, [%4, #512]       \n"
924                 "ld1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
925 
926                 "fmla   v16.4s, v10.4s, v0.s[1]     \n"
927                 "fmla   v17.4s, v11.4s, v0.s[1]     \n"
928 
929                 "fmla   v16.4s, v12.4s, v0.s[2]     \n"
930                 "fmla   v17.4s, v13.4s, v0.s[2]     \n"
931 
932                 "subs   %w0, %w0, #1                \n"
933 
934                 "fmla   v16.4s, v14.4s, v0.s[3]     \n"
935                 "fmla   v17.4s, v15.4s, v0.s[3]     \n"
936 
937                 "bne    0b                          \n"
938 
939                 "st1    {v16.4s}, [%1], #16         \n"
940                 "st1    {v17.4s}, [%2], #16         \n"
941 
942                 : "=r"(nn),      // %0
943                 "=r"(outptr0), // %1
944                 "=r"(outptr1), // %2
945                 "=r"(tmpptr),  // %3
946                 "=r"(kptr01)   // %4
947                 : "0"(nn),
948                 "1"(outptr0),
949                 "2"(outptr1),
950                 "3"(tmpptr),
951                 "4"(kptr01),
952                 "r"(biasptr) // %10
953                 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
954         }
955     }
956 
957 #endif // __ARM_NEON && __aarch64__
958 
959     #pragma omp parallel for num_threads(opt.num_threads)
960     for (int p = remain_outch_start; p < outch; p++)
961     {
962         float* outptr0 = top_blob.channel(p);
963 
964         const float zeros[4] = {0.f, 0.f, 0.f, 0.f};
965         const float* biasptr = bias ? bias + p * 4 : zeros;
966 
967         int i = 0;
968 #if __aarch64__
969         for (; i + 11 < size; i += 12)
970         {
971             float* tmpptr = tmp.channel(i / 12);
972 
973             const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
974 
975             int nn = inch; // inch always > 0
976 
977             asm volatile(
978                 "ld1    {v0.4s}, [%8]               \n"
979                 "mov    v8.16b, v0.16b              \n"
980                 "mov    v9.16b, v0.16b              \n"
981                 "mov    v10.16b, v0.16b             \n"
982                 "mov    v11.16b, v0.16b             \n"
983                 "mov    v12.16b, v0.16b             \n"
984                 "mov    v13.16b, v0.16b             \n"
985                 "mov    v14.16b, v0.16b             \n"
986                 "mov    v15.16b, v0.16b             \n"
987                 "mov    v16.16b, v0.16b             \n"
988                 "mov    v17.16b, v0.16b             \n"
989                 "mov    v18.16b, v0.16b             \n"
990                 "mov    v19.16b, v0.16b             \n"
991 
992                 "0:                                 \n"
993 
994                 "prfm   pldl1keep, [%2, #512]       \n"
995                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
996 
997                 "prfm   pldl1keep, [%3, #512]       \n"
998                 "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0
999 
1000                 "fmla   v8.4s, v4.4s, v0.s[0]       \n"
1001                 "fmla   v9.4s, v4.4s, v0.s[1]       \n"
1002                 "fmla   v10.4s, v4.4s, v0.s[2]      \n"
1003                 "fmla   v11.4s, v4.4s, v0.s[3]      \n"
1004                 "fmla   v12.4s, v4.4s, v1.s[0]      \n"
1005                 "fmla   v13.4s, v4.4s, v1.s[1]      \n"
1006                 "fmla   v14.4s, v4.4s, v1.s[2]      \n"
1007                 "fmla   v15.4s, v4.4s, v1.s[3]      \n"
1008                 "fmla   v16.4s, v4.4s, v2.s[0]      \n"
1009                 "fmla   v17.4s, v4.4s, v2.s[1]      \n"
1010                 "fmla   v18.4s, v4.4s, v2.s[2]      \n"
1011                 "fmla   v19.4s, v4.4s, v2.s[3]      \n"
1012 
1013                 "prfm   pldl1keep, [%2, #512]       \n"
1014                 "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
1015 
1016                 "fmla   v8.4s, v5.4s, v3.s[0]       \n"
1017                 "fmla   v9.4s, v5.4s, v3.s[1]       \n"
1018                 "fmla   v10.4s, v5.4s, v3.s[2]      \n"
1019                 "fmla   v11.4s, v5.4s, v3.s[3]      \n"
1020                 "fmla   v12.4s, v5.4s, v20.s[0]     \n"
1021                 "fmla   v13.4s, v5.4s, v20.s[1]     \n"
1022                 "fmla   v14.4s, v5.4s, v20.s[2]     \n"
1023                 "fmla   v15.4s, v5.4s, v20.s[3]     \n"
1024                 "fmla   v16.4s, v5.4s, v21.s[0]     \n"
1025                 "fmla   v17.4s, v5.4s, v21.s[1]     \n"
1026                 "fmla   v18.4s, v5.4s, v21.s[2]     \n"
1027                 "fmla   v19.4s, v5.4s, v21.s[3]     \n"
1028 
1029                 "prfm   pldl1keep, [%2, #512]       \n"
1030                 "ld1    {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
1031 
1032                 "fmla   v8.4s, v6.4s, v22.s[0]      \n"
1033                 "fmla   v9.4s, v6.4s, v22.s[1]      \n"
1034                 "fmla   v10.4s, v6.4s, v22.s[2]     \n"
1035                 "fmla   v11.4s, v6.4s, v22.s[3]     \n"
1036                 "fmla   v12.4s, v6.4s, v23.s[0]     \n"
1037                 "fmla   v13.4s, v6.4s, v23.s[1]     \n"
1038                 "fmla   v14.4s, v6.4s, v23.s[2]     \n"
1039                 "fmla   v15.4s, v6.4s, v23.s[3]     \n"
1040                 "fmla   v16.4s, v6.4s, v24.s[0]     \n"
1041                 "fmla   v17.4s, v6.4s, v24.s[1]     \n"
1042                 "fmla   v18.4s, v6.4s, v24.s[2]     \n"
1043                 "fmla   v19.4s, v6.4s, v24.s[3]     \n"
1044 
1045                 "subs   %w0, %w0, #1                \n"
1046 
1047                 "fmla   v8.4s, v7.4s, v25.s[0]      \n"
1048                 "fmla   v9.4s, v7.4s, v25.s[1]      \n"
1049                 "fmla   v10.4s, v7.4s, v25.s[2]     \n"
1050                 "fmla   v11.4s, v7.4s, v25.s[3]     \n"
1051                 "fmla   v12.4s, v7.4s, v26.s[0]     \n"
1052                 "fmla   v13.4s, v7.4s, v26.s[1]     \n"
1053                 "fmla   v14.4s, v7.4s, v26.s[2]     \n"
1054                 "fmla   v15.4s, v7.4s, v26.s[3]     \n"
1055                 "fmla   v16.4s, v7.4s, v27.s[0]     \n"
1056                 "fmla   v17.4s, v7.4s, v27.s[1]     \n"
1057                 "fmla   v18.4s, v7.4s, v27.s[2]     \n"
1058                 "fmla   v19.4s, v7.4s, v27.s[3]     \n"
1059 
1060                 "bne    0b                          \n"
1061 
1062                 "st1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
1063                 "st1    {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
1064                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1065 
1066                 : "=r"(nn),      // %0
1067                 "=r"(outptr0), // %1
1068                 "=r"(tmpptr),  // %2
1069                 "=r"(kptr0)    // %3
1070                 : "0"(nn),
1071                 "1"(outptr0),
1072                 "2"(tmpptr),
1073                 "3"(kptr0),
1074                 "r"(biasptr) // %8
1075                 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
1076         }
1077 #endif
1078         for (; i + 7 < size; i += 8)
1079         {
1080 #if __aarch64__
1081             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
1082             const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
1083 #else
1084             float* tmpptr = tmp.channel(i / 8);
1085             const float* kptr0 = (const float*)kernel.channel(p);
1086 #endif
1087 
1088             int nn = inch; // inch always > 0
1089 
1090 #if __aarch64__
1091             asm volatile(
1092                 "ld1    {v0.4s}, [%8]               \n"
1093                 "mov    v16.16b, v0.16b             \n"
1094                 "mov    v17.16b, v0.16b             \n"
1095                 "mov    v18.16b, v0.16b             \n"
1096                 "mov    v19.16b, v0.16b             \n"
1097                 "mov    v20.16b, v0.16b             \n"
1098                 "mov    v21.16b, v0.16b             \n"
1099                 "mov    v22.16b, v0.16b             \n"
1100                 "mov    v23.16b, v0.16b             \n"
1101 
1102                 "0:                                 \n"
1103 
1104                 "prfm   pldl1keep, [%2, #512]       \n"
1105                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1106 
1107                 "prfm   pldl1keep, [%3, #512]       \n"
1108                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1109 
1110                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1111                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1112                 "fmla   v18.4s, v8.4s, v2.s[0]      \n"
1113                 "fmla   v19.4s, v8.4s, v3.s[0]      \n"
1114 
1115                 "prfm   pldl1keep, [%2, #512]       \n"
1116                 "ld1    {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7
1117 
1118                 "fmla   v20.4s, v8.4s, v4.s[0]      \n"
1119                 "fmla   v21.4s, v8.4s, v5.s[0]      \n"
1120                 "fmla   v22.4s, v8.4s, v6.s[0]      \n"
1121                 "fmla   v23.4s, v8.4s, v7.s[0]      \n"
1122 
1123                 "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1124                 "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1125                 "fmla   v18.4s, v9.4s, v2.s[1]      \n"
1126                 "fmla   v19.4s, v9.4s, v3.s[1]      \n"
1127                 "fmla   v20.4s, v9.4s, v4.s[1]      \n"
1128                 "fmla   v21.4s, v9.4s, v5.s[1]      \n"
1129                 "fmla   v22.4s, v9.4s, v6.s[1]      \n"
1130                 "fmla   v23.4s, v9.4s, v7.s[1]      \n"
1131 
1132                 "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1133                 "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1134                 "fmla   v18.4s, v10.4s, v2.s[2]     \n"
1135                 "fmla   v19.4s, v10.4s, v3.s[2]     \n"
1136                 "fmla   v20.4s, v10.4s, v4.s[2]     \n"
1137                 "fmla   v21.4s, v10.4s, v5.s[2]     \n"
1138                 "fmla   v22.4s, v10.4s, v6.s[2]     \n"
1139                 "fmla   v23.4s, v10.4s, v7.s[2]     \n"
1140 
1141                 "subs   %w0, %w0, #1                \n"
1142 
1143                 "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1144                 "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1145                 "fmla   v18.4s, v11.4s, v2.s[3]     \n"
1146                 "fmla   v19.4s, v11.4s, v3.s[3]     \n"
1147                 "fmla   v20.4s, v11.4s, v4.s[3]     \n"
1148                 "fmla   v21.4s, v11.4s, v5.s[3]     \n"
1149                 "fmla   v22.4s, v11.4s, v6.s[3]     \n"
1150                 "fmla   v23.4s, v11.4s, v7.s[3]     \n"
1151 
1152                 "bne    0b                          \n"
1153 
1154                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1155                 "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
1156 
1157                 : "=r"(nn),      // %0
1158                 "=r"(outptr0), // %1
1159                 "=r"(tmpptr),  // %2
1160                 "=r"(kptr0)    // %3
1161                 : "0"(nn),
1162                 "1"(outptr0),
1163                 "2"(tmpptr),
1164                 "3"(kptr0),
1165                 "r"(biasptr) // %8
1166                 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
1167 #else
1168             asm volatile(
1169                 "vld1.f32   {d0-d1}, [%8]   \n"
1170                 "vmov       q8, q0          \n"
1171                 "vmov       q9, q0          \n"
1172                 "vmov       q10, q0         \n"
1173                 "vmov       q11, q0         \n"
1174                 "vmov       q12, q0         \n"
1175                 "vmov       q13, q0         \n"
1176                 "vmov       q14, q0         \n"
1177                 "vmov       q15, q0         \n"
1178 
1179                 "0:                         \n"
1180 
1181                 "pld        [%2, #512]      \n"
1182                 "vldm       %2!, {d0-d7}    \n"
1183 
1184                 "pld        [%3, #512]      \n"
1185                 "vldm       %3!, {d8-d15}   \n"
1186 
1187                 "vmla.f32   q8, q4, d0[0]   \n"
1188                 "vmla.f32   q9, q4, d0[1]   \n"
1189                 "vmla.f32   q10, q4, d1[0]  \n"
1190                 "vmla.f32   q11, q4, d1[1]  \n"
1191                 "vmla.f32   q12, q4, d2[0]  \n"
1192                 "vmla.f32   q13, q4, d2[1]  \n"
1193                 "vmla.f32   q14, q4, d3[0]  \n"
1194                 "vmla.f32   q15, q4, d3[1]  \n"
1195 
1196                 "vmla.f32   q8, q5, d4[0]   \n"
1197                 "vmla.f32   q9, q5, d4[1]   \n"
1198                 "vmla.f32   q10, q5, d5[0]  \n"
1199                 "vmla.f32   q11, q5, d5[1]  \n"
1200                 "vmla.f32   q12, q5, d6[0]  \n"
1201                 "vmla.f32   q13, q5, d6[1]  \n"
1202                 "vmla.f32   q14, q5, d7[0]  \n"
1203                 "vmla.f32   q15, q5, d7[1]  \n"
1204 
1205                 "pld        [%2, #512]      \n"
1206                 "vldm       %2!, {d0-d7}    \n"
1207 
1208                 "vmla.f32   q8, q6, d0[0]   \n"
1209                 "vmla.f32   q9, q6, d0[1]   \n"
1210                 "vmla.f32   q10, q6, d1[0]  \n"
1211                 "vmla.f32   q11, q6, d1[1]  \n"
1212                 "vmla.f32   q12, q6, d2[0]  \n"
1213                 "vmla.f32   q13, q6, d2[1]  \n"
1214                 "vmla.f32   q14, q6, d3[0]  \n"
1215                 "vmla.f32   q15, q6, d3[1]  \n"
1216 
1217                 "subs       %0, %0, #1      \n"
1218 
1219                 "vmla.f32   q8, q7, d4[0]   \n"
1220                 "vmla.f32   q9, q7, d4[1]   \n"
1221                 "vmla.f32   q10, q7, d5[0]  \n"
1222                 "vmla.f32   q11, q7, d5[1]  \n"
1223                 "vmla.f32   q12, q7, d6[0]  \n"
1224                 "vmla.f32   q13, q7, d6[1]  \n"
1225                 "vmla.f32   q14, q7, d7[0]  \n"
1226                 "vmla.f32   q15, q7, d7[1]  \n"
1227 
1228                 "bne        0b              \n"
1229 
1230                 "vstm       %1!, {d16-d23}  \n"
1231                 "vstm       %1!, {d24-d31}  \n"
1232 
1233                 : "=r"(nn),      // %0
1234                 "=r"(outptr0), // %1
1235                 "=r"(tmpptr),  // %2
1236                 "=r"(kptr0)    // %3
1237                 : "0"(nn),
1238                 "1"(outptr0),
1239                 "2"(tmpptr),
1240                 "3"(kptr0),
1241                 "r"(biasptr) // %8
1242                 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1243 #endif
1244         }
1245         for (; i + 3 < size; i += 4)
1246         {
1247 #if __aarch64__
1248             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1249             const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
1250 #else
1251             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
1252             const float* kptr0 = (const float*)kernel.channel(p);
1253 #endif
1254 
1255             int nn = inch; // inch always > 0
1256 
1257 #if __aarch64__
1258             asm volatile(
1259                 "ld1    {v0.4s}, [%8]               \n"
1260                 "mov    v16.16b, v0.16b             \n"
1261                 "mov    v17.16b, v0.16b             \n"
1262                 "mov    v18.16b, v0.16b             \n"
1263                 "mov    v19.16b, v0.16b             \n"
1264 
1265                 "0:                                 \n"
1266 
1267                 "prfm   pldl1keep, [%2, #512]       \n"
1268                 "ld1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1269 
1270                 "prfm   pldl1keep, [%3, #512]       \n"
1271                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1272 
1273                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1274                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1275                 "fmla   v18.4s, v8.4s, v2.s[0]      \n"
1276                 "fmla   v19.4s, v8.4s, v3.s[0]      \n"
1277 
1278                 "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1279                 "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1280                 "fmla   v18.4s, v9.4s, v2.s[1]      \n"
1281                 "fmla   v19.4s, v9.4s, v3.s[1]      \n"
1282 
1283                 "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1284                 "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1285                 "fmla   v18.4s, v10.4s, v2.s[2]     \n"
1286                 "fmla   v19.4s, v10.4s, v3.s[2]     \n"
1287 
1288                 "subs   %w0, %w0, #1                \n"
1289 
1290                 "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1291                 "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1292                 "fmla   v18.4s, v11.4s, v2.s[3]     \n"
1293                 "fmla   v19.4s, v11.4s, v3.s[3]     \n"
1294 
1295                 "bne    0b                          \n"
1296 
1297                 "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1298 
1299                 : "=r"(nn),      // %0
1300                 "=r"(outptr0), // %1
1301                 "=r"(tmpptr),  // %2
1302                 "=r"(kptr0)    // %3
1303                 : "0"(nn),
1304                 "1"(outptr0),
1305                 "2"(tmpptr),
1306                 "3"(kptr0),
1307                 "r"(biasptr) // %8
1308                 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
1309 #else
1310             asm volatile(
1311                 "vld1.f32   {d0-d1}, [%8]   \n"
1312                 "vmov       q8, q0          \n"
1313                 "vmov       q9, q0          \n"
1314                 "vmov       q10, q0         \n"
1315                 "vmov       q11, q0         \n"
1316 
1317                 "0:                         \n"
1318 
1319                 "pld        [%2, #512]      \n"
1320                 "vldm       %2!, {d0-d7}    \n"
1321 
1322                 "pld        [%3, #512]      \n"
1323                 "vldm       %3!, {d8-d15}   \n"
1324 
1325                 "vmla.f32   q8, q4, d0[0]   \n"
1326                 "vmla.f32   q9, q4, d2[0]   \n"
1327                 "vmla.f32   q10, q4, d4[0]  \n"
1328                 "vmla.f32   q11, q4, d6[0]  \n"
1329 
1330                 "vmla.f32   q8, q5, d0[1]   \n"
1331                 "vmla.f32   q9, q5, d2[1]   \n"
1332                 "vmla.f32   q10, q5, d4[1]  \n"
1333                 "vmla.f32   q11, q5, d6[1]  \n"
1334 
1335                 "vmla.f32   q8, q6, d1[0]   \n"
1336                 "vmla.f32   q9, q6, d3[0]   \n"
1337                 "vmla.f32   q10, q6, d5[0]  \n"
1338                 "vmla.f32   q11, q6, d7[0]  \n"
1339 
1340                 "subs       %0, %0, #1      \n"
1341 
1342                 "vmla.f32   q8, q7, d1[1]   \n"
1343                 "vmla.f32   q9, q7, d3[1]   \n"
1344                 "vmla.f32   q10, q7, d5[1]  \n"
1345                 "vmla.f32   q11, q7, d7[1]  \n"
1346 
1347                 "bne        0b              \n"
1348 
1349                 "vstm       %1!, {d16-d23}  \n"
1350 
1351                 : "=r"(nn),      // %0
1352                 "=r"(outptr0), // %1
1353                 "=r"(tmpptr),  // %2
1354                 "=r"(kptr0)    // %3
1355                 : "0"(nn),
1356                 "1"(outptr0),
1357                 "2"(tmpptr),
1358                 "3"(kptr0),
1359                 "r"(biasptr) // %8
1360                 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1361 #endif
1362         }
1363         for (; i + 1 < size; i += 2)
1364         {
1365 #if __aarch64__
1366             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
1367             const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
1368 #else
1369             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
1370             const float* kptr0 = (const float*)kernel.channel(p);
1371 #endif
1372 
1373             int nn = inch; // inch always > 0
1374 
1375 #if __aarch64__
1376             asm volatile(
1377                 "ld1    {v0.4s}, [%8]               \n"
1378                 "mov    v16.16b, v0.16b             \n"
1379                 "mov    v17.16b, v0.16b             \n"
1380 
1381                 "0:                                 \n"
1382 
1383                 "prfm   pldl1keep, [%2, #256]       \n"
1384                 "ld1    {v0.4s, v1.4s}, [%2], #32   \n" // r0 r1
1385 
1386                 "prfm   pldl1keep, [%3, #512]       \n"
1387                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1388 
1389                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1390                 "fmla   v17.4s, v8.4s, v1.s[0]      \n"
1391 
1392                 "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1393                 "fmla   v17.4s, v9.4s, v1.s[1]      \n"
1394 
1395                 "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1396                 "fmla   v17.4s, v10.4s, v1.s[2]     \n"
1397 
1398                 "subs   %w0, %w0, #1                \n"
1399 
1400                 "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1401                 "fmla   v17.4s, v11.4s, v1.s[3]     \n"
1402 
1403                 "bne    0b                          \n"
1404 
1405                 "st1    {v16.4s, v17.4s}, [%1], #32 \n"
1406 
1407                 : "=r"(nn),      // %0
1408                 "=r"(outptr0), // %1
1409                 "=r"(tmpptr),  // %2
1410                 "=r"(kptr0)    // %3
1411                 : "0"(nn),
1412                 "1"(outptr0),
1413                 "2"(tmpptr),
1414                 "3"(kptr0),
1415                 "r"(biasptr) // %8
1416                 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
1417 #else
1418             asm volatile(
1419                 "vld1.f32   {d0-d1}, [%8]   \n"
1420                 "vmov       q8, q0          \n"
1421                 "vmov       q9, q0          \n"
1422 
1423                 "0:                         \n"
1424 
1425                 "pld        [%2, #256]      \n"
1426                 "vld1.f32   {d0-d3}, [%2 :128]! \n"
1427 
1428                 "pld        [%3, #512]      \n"
1429                 "vldm       %3!, {d8-d15}   \n"
1430 
1431                 "vmla.f32   q8, q4, d0[0]   \n"
1432                 "vmla.f32   q9, q4, d2[0]   \n"
1433 
1434                 "vmla.f32   q8, q5, d0[1]   \n"
1435                 "vmla.f32   q9, q5, d2[1]   \n"
1436 
1437                 "vmla.f32   q8, q6, d1[0]   \n"
1438                 "vmla.f32   q9, q6, d3[0]   \n"
1439 
1440                 "subs       %0, %0, #1      \n"
1441 
1442                 "vmla.f32   q8, q7, d1[1]   \n"
1443                 "vmla.f32   q9, q7, d3[1]   \n"
1444 
1445                 "bne        0b              \n"
1446 
1447                 "vst1.f32   {d16-d19}, [%1 :128]! \n"
1448 
1449                 : "=r"(nn),      // %0
1450                 "=r"(outptr0), // %1
1451                 "=r"(tmpptr),  // %2
1452                 "=r"(kptr0)    // %3
1453                 : "0"(nn),
1454                 "1"(outptr0),
1455                 "2"(tmpptr),
1456                 "3"(kptr0),
1457                 "r"(biasptr) // %8
1458                 : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
1459 #endif
1460         }
1461         for (; i < size; i++)
1462         {
1463 #if __aarch64__
1464             float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
1465             const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
1466 #else
1467             float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
1468             const float* kptr0 = (const float*)kernel.channel(p);
1469 #endif
1470 
1471             int nn = inch; // inch always > 0
1472 
1473 #if __aarch64__
1474             asm volatile(
1475                 "ld1    {v16.4s}, [%8]              \n"
1476 
1477                 "0:                                 \n"
1478 
1479                 "prfm   pldl1keep, [%2, #128]       \n"
1480                 "ld1    {v0.4s}, [%2], #16          \n" // r0
1481 
1482                 "prfm   pldl1keep, [%3, #512]       \n"
1483                 "ld1    {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1484 
1485                 "fmla   v16.4s, v8.4s, v0.s[0]      \n"
1486                 "fmla   v16.4s, v9.4s, v0.s[1]      \n"
1487 
1488                 "subs   %w0, %w0, #1                \n"
1489 
1490                 "fmla   v16.4s, v10.4s, v0.s[2]     \n"
1491                 "fmla   v16.4s, v11.4s, v0.s[3]     \n"
1492 
1493                 "bne    0b                          \n"
1494 
1495                 "st1    {v16.4s}, [%1], #16         \n"
1496 
1497                 : "=r"(nn),      // %0
1498                 "=r"(outptr0), // %1
1499                 "=r"(tmpptr),  // %2
1500                 "=r"(kptr0)    // %3
1501                 : "0"(nn),
1502                 "1"(outptr0),
1503                 "2"(tmpptr),
1504                 "3"(kptr0),
1505                 "r"(biasptr) // %8
1506                 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
1507 #else
1508             asm volatile(
1509                 "vld1.f32   {d16-d17}, [%8] \n"
1510 
1511                 "0:                         \n"
1512 
1513                 "pld        [%2, #128]      \n"
1514                 "vld1.f32   {d0-d1}, [%2 :128]! \n"
1515 
1516                 "pld        [%3, #512]      \n"
1517                 "vldm       %3!, {d8-d15}   \n"
1518 
1519                 "vmla.f32   q8, q4, d0[0]   \n"
1520                 "vmla.f32   q8, q5, d0[1]   \n"
1521 
1522                 "subs       %0, %0, #1      \n"
1523 
1524                 "vmla.f32   q8, q6, d1[0]   \n"
1525                 "vmla.f32   q8, q7, d1[1]   \n"
1526 
1527                 "bne        0b              \n"
1528 
1529                 "vst1.f32   {d16-d17}, [%1 :128]! \n"
1530 
1531                 : "=r"(nn),      // %0
1532                 "=r"(outptr0), // %1
1533                 "=r"(tmpptr),  // %2
1534                 "=r"(kptr0)    // %3
1535                 : "0"(nn),
1536                 "1"(outptr0),
1537                 "2"(tmpptr),
1538                 "3"(kptr0),
1539                 "r"(biasptr) // %8
1540                 : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
1541 #endif
1542         }
1543     }
1544 
1545     //     // NOTE sgemm
1546     //     for (; p<outch; p++)
1547     //     {
1548     //         Mat out0 = top_blob.channel(p);
1549     //
1550     //         const float bias0 = bias ? bias[p] : 0.f;
1551     //
1552     //         float* outptr0 = out0;
1553     //
1554     //         for (int i=0; i<size; i++)
1555     //         {
1556     //             float sum = bias0;
1557     //
1558     //             const float* kptr = _kernel.channel(p);
1559     //
1560     //             for (int q=0; q<inch; q++)
1561     //             {
1562     //                 const float* img0 = bottom_blob.channel(q);
1563     //
1564     //                 sum += img0[i] * kptr[0];
1565     //                 kptr ++;
1566     //             }
1567     //
1568     //             outptr0[i] = sum;
1569     //         }
1570     //     }
1571 }
1572 
conv1x1s2_pack4_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)1573 static void conv1x1s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
1574 {
1575     int w = bottom_blob.w;
1576     int channels = bottom_blob.c;
1577     size_t elemsize = bottom_blob.elemsize;
1578     int elempack = bottom_blob.elempack;
1579 
1580     int outw = top_blob.w;
1581     int outh = top_blob.h;
1582 
1583     const int tailstep = (w - 2 * outw + w) * 4;
1584 
1585     Mat bottom_blob_shrinked;
1586     bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);
1587 
1588     #pragma omp parallel for num_threads(opt.num_threads)
1589     for (int p = 0; p < channels; p++)
1590     {
1591         const float* r0 = bottom_blob.channel(p);
1592         float* outptr = bottom_blob_shrinked.channel(p);
1593 
1594         for (int i = 0; i < outh; i++)
1595         {
1596             for (int j = 0; j < outw; j++)
1597             {
1598                 float32x4_t _v = vld1q_f32(r0);
1599                 vst1q_f32(outptr, _v);
1600 
1601                 r0 += 8;
1602                 outptr += 4;
1603             }
1604 
1605             r0 += tailstep;
1606         }
1607     }
1608 
1609     conv1x1s1_sgemm_pack4_neon(bottom_blob_shrinked, top_blob, kernel, _bias, opt);
1610 }
1611