// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

static void conv1x1s1_sgemm_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
{
    // interleave
    // src = inch-outch
    // dst = 4b-4a-inch/4a-outch/4b
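    // each 16-float block stores the weights of 4 output channels back-to-back
    // for each of 4 consecutive input channels (8 output channels per 32-float
    // block on the aarch64 path), so the sgemm kernels below can walk the
    // packed weights with a single linear pointer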
#if __aarch64__
    kernel_tm_pack4.create(2 * 1, inch / 4, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
#else
    kernel_tm_pack4.create(1, inch / 4, outch / 4, (size_t)4u * 16, 16);
#endif
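    // on aarch64 each Mat channel holds one 8-outch block (two 4-outch halves
    // side by side), plus one channel for a trailing 4-outch block when
    // outch / 4 is odd, hence (outch / 4) / 2 + (outch / 4) % 2 channels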

    int q = 0;
#if __aarch64__
    for (; q + 7 < outch; q += 8)
    {
        const float* k0 = (const float*)kernel + (q + 0) * inch;
        const float* k1 = (const float*)kernel + (q + 1) * inch;
        const float* k2 = (const float*)kernel + (q + 2) * inch;
        const float* k3 = (const float*)kernel + (q + 3) * inch;
        const float* k4 = (const float*)kernel + (q + 4) * inch;
        const float* k5 = (const float*)kernel + (q + 5) * inch;
        const float* k6 = (const float*)kernel + (q + 6) * inch;
        const float* k7 = (const float*)kernel + (q + 7) * inch;

        float* g0 = kernel_tm_pack4.channel(q / 8);

        for (int p = 0; p + 3 < inch; p += 4)
        {
            g0[0] = k0[0];
            g0[1] = k1[0];
            g0[2] = k2[0];
            g0[3] = k3[0];

            g0[4] = k4[0];
            g0[5] = k5[0];
            g0[6] = k6[0];
            g0[7] = k7[0];

            g0[8] = k0[1];
            g0[9] = k1[1];
            g0[10] = k2[1];
            g0[11] = k3[1];

            g0[12] = k4[1];
            g0[13] = k5[1];
            g0[14] = k6[1];
            g0[15] = k7[1];

            g0[16] = k0[2];
            g0[17] = k1[2];
            g0[18] = k2[2];
            g0[19] = k3[2];

            g0[20] = k4[2];
            g0[21] = k5[2];
            g0[22] = k6[2];
            g0[23] = k7[2];

            g0[24] = k0[3];
            g0[25] = k1[3];
            g0[26] = k2[3];
            g0[27] = k3[3];

            g0[28] = k4[3];
            g0[29] = k5[3];
            g0[30] = k6[3];
            g0[31] = k7[3];

            k0 += 4;
            k1 += 4;
            k2 += 4;
            k3 += 4;
            k4 += 4;
            k5 += 4;
            k6 += 4;
            k7 += 4;
            g0 += 32;
        }
    }
#endif // __aarch64__
    for (; q + 3 < outch; q += 4)
    {
        const float* k0 = (const float*)kernel + (q + 0) * inch;
        const float* k1 = (const float*)kernel + (q + 1) * inch;
        const float* k2 = (const float*)kernel + (q + 2) * inch;
        const float* k3 = (const float*)kernel + (q + 3) * inch;

#if __aarch64__
        float* g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
#else
        float* g0 = kernel_tm_pack4.channel(q / 4);
#endif

        for (int p = 0; p + 3 < inch; p += 4)
        {
            g0[0] = k0[0];
            g0[1] = k1[0];
            g0[2] = k2[0];
            g0[3] = k3[0];

            g0[4] = k0[1];
            g0[5] = k1[1];
            g0[6] = k2[1];
            g0[7] = k3[1];

            g0[8] = k0[2];
            g0[9] = k1[2];
            g0[10] = k2[2];
            g0[11] = k3[2];

            g0[12] = k0[3];
            g0[13] = k1[3];
            g0[14] = k2[3];
            g0[15] = k3[3];

            k0 += 4;
            k1 += 4;
            k2 += 4;
            k3 += 4;
            g0 += 16;
        }
    }
}
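
// An illustrative scalar reference of how one interleaved 4-outch block is
// consumed (hypothetical names, arm32 4-outch layout assumed; the aarch64
// 8-outch blocks work the same way with 8 weights per input channel):
//
//     const float* g0 = kernel_tm_pack4.channel(q / 4);
//     float sum[4] = {0.f, 0.f, 0.f, 0.f};
//     for (int c = 0; c < inch; c++) // inch = scalar input channels
//     {
//         for (int b = 0; b < 4; b++)
//             sum[b] += x[c] * g0[b]; // x[c] = input pixel value at channel c
//         g0 += 4;
//     }
//     // sum[b] is the output for channel q + b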

static void conv1x1s1_sgemm_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;
    int outch = top_blob.c;

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int size = w * h;

    const float* bias = _bias;

    // interleave
#if __aarch64__
    Mat tmp(12, inch, size / 12 + (size % 12) / 8 + (size % 12 % 8) / 4 + (size % 12 % 4) / 2 + size % 12 % 2, elemsize, elempack, opt.workspace_allocator);
#else
    Mat tmp(8, inch, size / 8 + (size % 8) / 4 + (size % 4) / 2 + size % 2, elemsize, elempack, opt.workspace_allocator);
#endif
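    // output columns are gathered into tiles of 12/8/4/2/1 pixels (8/4/2/1 on
    // arm32); index expressions of the form i / 12 + (i % 12) / 8 + ... map a
    // column index to the tmp channel holding its tile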
    {
        int nn_size;
        int remain_size_start;

#if __aarch64__
        nn_size = size / 12;
        remain_size_start = nn_size * 12;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = ii * 12;

            const float* img0 = bottom_blob.channel(0);
            img0 += i * 4;

            float* tmpptr = tmp.channel(i / 12);

            for (int q = 0; q < inch; q++)
            {
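                // ld4 de-interleaves four pack4 pixels so each register holds
                // one lane across four pixels; the st1 sequence then stores the
                // 12-pixel tile lane-planar: 12 x lane0, 12 x lane1, ...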
                asm volatile(
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
                    "st1 {v0.4s}, [%1], #16 \n"
                    "st1 {v4.4s}, [%1], #16 \n"
                    "st1 {v8.4s}, [%1], #16 \n"
                    "sub %0, %0, #128 \n"
                    "st1 {v1.4s}, [%1], #16 \n"
                    "st1 {v5.4s}, [%1], #16 \n"
                    "st1 {v9.4s}, [%1], #16 \n"
                    "st1 {v2.4s}, [%1], #16 \n"
                    "st1 {v6.4s}, [%1], #16 \n"
                    "st1 {v10.4s}, [%1], #16 \n"
                    "st1 {v3.4s}, [%1], #16 \n"
                    "st1 {v7.4s}, [%1], #16 \n"
                    "st1 {v11.4s}, [%1], #16 \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
                img0 += bottom_blob.cstep * 4;
            }
        }
#else
        remain_size_start = 0;
#endif
        nn_size = (size - remain_size_start) >> 3;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 8;

            const float* img0 = bottom_blob.channel(0);
            img0 += i * 4;

#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
#else
            float* tmpptr = tmp.channel(i / 8);
#endif

            for (int q = 0; q < inch; q++)
            {
#if __aarch64__
                asm volatile(
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                    "sub %0, %0, #64 \n"
                    "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
                asm volatile(
                    "pld [%0, #512] \n"
                    "vldm %0!, {d0-d7} \n"
                    "pld [%0, #512] \n"
                    "vldm %0, {d16-d23} \n"

                    // transpose 8x4
                    "vtrn.32 q0, q1 \n"
                    "vtrn.32 q2, q3 \n"
                    "vtrn.32 q8, q9 \n"
                    "vtrn.32 q10, q11 \n"
                    "vswp d1, d4 \n"
                    "vswp d3, d6 \n"
                    "vswp d17, d20 \n"
                    "vswp d19, d22 \n"
                    "vswp q1, q8 \n"
                    "vswp q3, q10 \n"

                    "vst1.f32 {d0-d3}, [%1 :128]! \n"
                    "vst1.f32 {d16-d19}, [%1 :128]! \n"
                    "sub %0, %0, #64 \n"
                    "vst1.f32 {d4-d7}, [%1 :128]! \n"
                    "vst1.f32 {d20-d23}, [%1 :128]! \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
#endif // __aarch64__
                img0 += bottom_blob.cstep * 4;
            }
        }

        remain_size_start += nn_size << 3;
        nn_size = (size - remain_size_start) >> 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 4;

            const float* img0 = bottom_blob.channel(0);
            img0 += i * 4;

#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
#endif

            for (int q = 0; q < inch; q++)
            {
#if __aarch64__
                asm volatile(
                    "prfm pldl1keep, [%0, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "v0", "v1", "v2", "v3");
#else
                asm volatile(
                    "pld [%0, #512] \n"
                    "vldm %0, {d0-d7} \n"
                    "vstm %1!, {d0-d7} \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
                img0 += bottom_blob.cstep * 4;
            }
        }

        remain_size_start += nn_size << 2;
        nn_size = (size - remain_size_start) >> 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int ii = 0; ii < nn_size; ii++)
        {
            int i = remain_size_start + ii * 2;

            const float* img0 = bottom_blob.channel(0);
            img0 += i * 4;

#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
#endif

            for (int q = 0; q < inch; q++)
            {
#if __aarch64__
                asm volatile(
                    "prfm pldl1keep, [%0, #256] \n"
                    "ld1 {v0.4s, v1.4s}, [%0] \n"
                    "st1 {v0.4s, v1.4s}, [%1], #32 \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "v0", "v1");
#else
                asm volatile(
                    "pld [%0, #256] \n"
                    "vld1.f32 {d0-d3}, [%0 :128] \n"
                    "vst1.f32 {d0-d3}, [%1 :128]! \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "q0", "q1");
#endif // __aarch64__
                img0 += bottom_blob.cstep * 4;
            }
        }

        remain_size_start += nn_size << 1;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i = remain_size_start; i < size; i++)
        {
            const float* img0 = bottom_blob.channel(0);
            img0 += i * 4;

#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
#endif

            for (int q = 0; q < inch; q++)
            {
#if __aarch64__
                asm volatile(
                    "prfm pldl1keep, [%0, #128] \n"
                    "ld1 {v0.4s}, [%0] \n"
                    "st1 {v0.4s}, [%1], #16 \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "v0");
#else
                asm volatile(
                    "pld [%0, #128] \n"
                    "vld1.f32 {d0-d1}, [%0 :128] \n"
                    "vst1.f32 {d0-d1}, [%1 :128]! \n"
                    : "=r"(img0),  // %0
                    "=r"(tmpptr)   // %1
                    : "0"(img0),
                    "1"(tmpptr)
                    : "memory", "q0");
#endif // __aarch64__
                img0 += bottom_blob.cstep * 4;
            }
        }
    }

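    // sgemm: tmp (tiled input) x kernel (interleaved weights) -> top_blob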
    int remain_outch_start = 0;

#if __ARM_NEON && __aarch64__
    int nn_outch = 0;
    nn_outch = outch >> 1;
    remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        float* outptr0 = top_blob.channel(p);
        float* outptr1 = top_blob.channel(p + 1);

        const float zeros[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p * 4 : zeros;

        int i = 0;
        for (; i + 11 < size; i += 12)
        {
            const float* tmpptr = tmp.channel(i / 12);

            const float* kptr01 = (const float*)kernel.channel(pp);

            int nn = inch; // inch always > 0

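            // accumulators: v8-v19 hold the 12 output pixels of outptr0,
            // v20-v31 those of outptr1; all start from the bias in %10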
            asm volatile(
                "ld1 {v0.4s, v1.4s}, [%10] \n"
                "mov v8.16b, v0.16b \n"
                "mov v9.16b, v0.16b \n"
                "mov v10.16b, v0.16b \n"
                "mov v11.16b, v0.16b \n"
                "mov v12.16b, v0.16b \n"
                "mov v13.16b, v0.16b \n"
                "mov v14.16b, v0.16b \n"
                "mov v15.16b, v0.16b \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"
                "mov v20.16b, v1.16b \n"
                "mov v21.16b, v1.16b \n"
                "mov v22.16b, v1.16b \n"
                "mov v23.16b, v1.16b \n"
                "mov v24.16b, v1.16b \n"
                "mov v25.16b, v1.16b \n"
                "mov v26.16b, v1.16b \n"
                "mov v27.16b, v1.16b \n"
                "mov v28.16b, v1.16b \n"
                "mov v29.16b, v1.16b \n"
                "mov v30.16b, v1.16b \n"
                "mov v31.16b, v1.16b \n"

                "0: \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w0011_01

                "fmla v8.4s, v4.4s, v0.s[0] \n"
                "fmla v9.4s, v4.4s, v0.s[1] \n"
                "fmla v10.4s, v4.4s, v0.s[2] \n"
                "fmla v11.4s, v4.4s, v0.s[3] \n"
                "fmla v12.4s, v4.4s, v1.s[0] \n"
                "fmla v13.4s, v4.4s, v1.s[1] \n"
                "fmla v14.4s, v4.4s, v1.s[2] \n"
                "fmla v15.4s, v4.4s, v1.s[3] \n"
                "fmla v16.4s, v4.4s, v2.s[0] \n"
                "fmla v17.4s, v4.4s, v2.s[1] \n"
                "fmla v18.4s, v4.4s, v2.s[2] \n"
                "fmla v19.4s, v4.4s, v2.s[3] \n"

                "fmla v20.4s, v5.4s, v0.s[0] \n"
                "fmla v21.4s, v5.4s, v0.s[1] \n"
                "fmla v22.4s, v5.4s, v0.s[2] \n"
                "fmla v23.4s, v5.4s, v0.s[3] \n"
                "fmla v24.4s, v5.4s, v1.s[0] \n"
                "fmla v25.4s, v5.4s, v1.s[1] \n"
                "fmla v26.4s, v5.4s, v1.s[2] \n"
                "fmla v27.4s, v5.4s, v1.s[3] \n"
                "fmla v28.4s, v5.4s, v2.s[0] \n"
                "fmla v29.4s, v5.4s, v2.s[1] \n"
                "fmla v30.4s, v5.4s, v2.s[2] \n"
                "fmla v31.4s, v5.4s, v2.s[3] \n"

                "fmla v8.4s, v6.4s, v3.s[0] \n"
                "fmla v9.4s, v6.4s, v3.s[1] \n"
                "fmla v10.4s, v6.4s, v3.s[2] \n"
                "fmla v11.4s, v6.4s, v3.s[3] \n"

                "fmla v20.4s, v7.4s, v3.s[0] \n"
                "fmla v21.4s, v7.4s, v3.s[1] \n"
                "fmla v22.4s, v7.4s, v3.s[2] \n"
                "fmla v23.4s, v7.4s, v3.s[3] \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"

                "fmla v12.4s, v6.4s, v0.s[0] \n"
                "fmla v13.4s, v6.4s, v0.s[1] \n"
                "fmla v14.4s, v6.4s, v0.s[2] \n"
                "fmla v15.4s, v6.4s, v0.s[3] \n"
                "fmla v16.4s, v6.4s, v1.s[0] \n"
                "fmla v17.4s, v6.4s, v1.s[1] \n"
                "fmla v18.4s, v6.4s, v1.s[2] \n"
                "fmla v19.4s, v6.4s, v1.s[3] \n"

                "fmla v24.4s, v7.4s, v0.s[0] \n"
                "fmla v25.4s, v7.4s, v0.s[1] \n"
                "fmla v26.4s, v7.4s, v0.s[2] \n"
                "fmla v27.4s, v7.4s, v0.s[3] \n"
                "fmla v28.4s, v7.4s, v1.s[0] \n"
                "fmla v29.4s, v7.4s, v1.s[1] \n"
                "fmla v30.4s, v7.4s, v1.s[2] \n"
                "fmla v31.4s, v7.4s, v1.s[3] \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w2233_01

                "fmla v8.4s, v4.4s, v2.s[0] \n"
                "fmla v9.4s, v4.4s, v2.s[1] \n"
                "fmla v10.4s, v4.4s, v2.s[2] \n"
                "fmla v11.4s, v4.4s, v2.s[3] \n"
                "fmla v12.4s, v4.4s, v3.s[0] \n"
                "fmla v13.4s, v4.4s, v3.s[1] \n"
                "fmla v14.4s, v4.4s, v3.s[2] \n"
                "fmla v15.4s, v4.4s, v3.s[3] \n"

                "fmla v20.4s, v5.4s, v2.s[0] \n"
                "fmla v21.4s, v5.4s, v2.s[1] \n"
                "fmla v22.4s, v5.4s, v2.s[2] \n"
                "fmla v23.4s, v5.4s, v2.s[3] \n"
                "fmla v24.4s, v5.4s, v3.s[0] \n"
                "fmla v25.4s, v5.4s, v3.s[1] \n"
                "fmla v26.4s, v5.4s, v3.s[2] \n"
                "fmla v27.4s, v5.4s, v3.s[3] \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"

                "fmla v16.4s, v4.4s, v0.s[0] \n"
                "fmla v17.4s, v4.4s, v0.s[1] \n"
                "fmla v18.4s, v4.4s, v0.s[2] \n"
                "fmla v19.4s, v4.4s, v0.s[3] \n"

                "fmla v28.4s, v5.4s, v0.s[0] \n"
                "fmla v29.4s, v5.4s, v0.s[1] \n"
                "fmla v30.4s, v5.4s, v0.s[2] \n"
                "fmla v31.4s, v5.4s, v0.s[3] \n"

                "fmla v8.4s, v6.4s, v1.s[0] \n"
                "fmla v9.4s, v6.4s, v1.s[1] \n"
                "fmla v10.4s, v6.4s, v1.s[2] \n"
                "fmla v11.4s, v6.4s, v1.s[3] \n"
                "fmla v12.4s, v6.4s, v2.s[0] \n"
                "fmla v13.4s, v6.4s, v2.s[1] \n"
                "fmla v14.4s, v6.4s, v2.s[2] \n"
                "fmla v15.4s, v6.4s, v2.s[3] \n"
                "fmla v16.4s, v6.4s, v3.s[0] \n"
                "fmla v17.4s, v6.4s, v3.s[1] \n"
                "fmla v18.4s, v6.4s, v3.s[2] \n"
                "fmla v19.4s, v6.4s, v3.s[3] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v20.4s, v7.4s, v1.s[0] \n"
                "fmla v21.4s, v7.4s, v1.s[1] \n"
                "fmla v22.4s, v7.4s, v1.s[2] \n"
                "fmla v23.4s, v7.4s, v1.s[3] \n"
                "fmla v24.4s, v7.4s, v2.s[0] \n"
                "fmla v25.4s, v7.4s, v2.s[1] \n"
                "fmla v26.4s, v7.4s, v2.s[2] \n"
                "fmla v27.4s, v7.4s, v2.s[3] \n"
                "fmla v28.4s, v7.4s, v3.s[0] \n"
                "fmla v29.4s, v7.4s, v3.s[1] \n"
                "fmla v30.4s, v7.4s, v3.s[2] \n"
                "fmla v31.4s, v7.4s, v3.s[3] \n"

                "bne 0b \n"

                "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
                "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
                "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(outptr1),   // %2
                "=r"(tmpptr),    // %3
                "=r"(kptr01)     // %4
                : "0"(nn),
                "1"(outptr0),
                "2"(outptr1),
                "3"(tmpptr),
                "4"(kptr01),
                "r"(biasptr)     // %10
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
        }
        for (; i + 7 < size; i += 8)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);

            const float* kptr01 = (const float*)kernel.channel(pp);

            int nn = inch; // inch always > 0

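            // same scheme with an 8-pixel tile: v16-v23 accumulate outptr0,
            // v24-v31 accumulate outptr1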
            asm volatile(
                "ld1 {v0.4s, v1.4s}, [%10] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"
                "mov v20.16b, v0.16b \n"
                "mov v21.16b, v0.16b \n"
                "mov v22.16b, v0.16b \n"
                "mov v23.16b, v0.16b \n"
                "mov v24.16b, v1.16b \n"
                "mov v25.16b, v1.16b \n"
                "mov v26.16b, v1.16b \n"
                "mov v27.16b, v1.16b \n"
                "mov v28.16b, v1.16b \n"
                "mov v29.16b, v1.16b \n"
                "mov v30.16b, v1.16b \n"
                "mov v31.16b, v1.16b \n"

                "0: \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"
                "fmla v18.4s, v8.4s, v2.s[0] \n"
                "fmla v19.4s, v8.4s, v3.s[0] \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7

                "fmla v20.4s, v8.4s, v4.s[0] \n"
                "fmla v21.4s, v8.4s, v5.s[0] \n"
                "fmla v22.4s, v8.4s, v6.s[0] \n"
                "fmla v23.4s, v8.4s, v7.s[0] \n"

                "fmla v24.4s, v9.4s, v0.s[0] \n"
                "fmla v25.4s, v9.4s, v1.s[0] \n"
                "fmla v26.4s, v9.4s, v2.s[0] \n"
                "fmla v27.4s, v9.4s, v3.s[0] \n"
                "fmla v28.4s, v9.4s, v4.s[0] \n"
                "fmla v29.4s, v9.4s, v5.s[0] \n"
                "fmla v30.4s, v9.4s, v6.s[0] \n"
                "fmla v31.4s, v9.4s, v7.s[0] \n"

                "fmla v16.4s, v10.4s, v0.s[1] \n"
                "fmla v17.4s, v10.4s, v1.s[1] \n"
                "fmla v18.4s, v10.4s, v2.s[1] \n"
                "fmla v19.4s, v10.4s, v3.s[1] \n"
                "fmla v20.4s, v10.4s, v4.s[1] \n"
                "fmla v21.4s, v10.4s, v5.s[1] \n"
                "fmla v22.4s, v10.4s, v6.s[1] \n"
                "fmla v23.4s, v10.4s, v7.s[1] \n"

                "fmla v24.4s, v11.4s, v0.s[1] \n"
                "fmla v25.4s, v11.4s, v1.s[1] \n"
                "fmla v26.4s, v11.4s, v2.s[1] \n"
                "fmla v27.4s, v11.4s, v3.s[1] \n"
                "fmla v28.4s, v11.4s, v4.s[1] \n"
                "fmla v29.4s, v11.4s, v5.s[1] \n"
                "fmla v30.4s, v11.4s, v6.s[1] \n"
                "fmla v31.4s, v11.4s, v7.s[1] \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01

                "fmla v16.4s, v12.4s, v0.s[2] \n"
                "fmla v17.4s, v12.4s, v1.s[2] \n"
                "fmla v18.4s, v12.4s, v2.s[2] \n"
                "fmla v19.4s, v12.4s, v3.s[2] \n"
                "fmla v20.4s, v12.4s, v4.s[2] \n"
                "fmla v21.4s, v12.4s, v5.s[2] \n"
                "fmla v22.4s, v12.4s, v6.s[2] \n"
                "fmla v23.4s, v12.4s, v7.s[2] \n"

                "fmla v24.4s, v13.4s, v0.s[2] \n"
                "fmla v25.4s, v13.4s, v1.s[2] \n"
                "fmla v26.4s, v13.4s, v2.s[2] \n"
                "fmla v27.4s, v13.4s, v3.s[2] \n"
                "fmla v28.4s, v13.4s, v4.s[2] \n"
                "fmla v29.4s, v13.4s, v5.s[2] \n"
                "fmla v30.4s, v13.4s, v6.s[2] \n"
                "fmla v31.4s, v13.4s, v7.s[2] \n"

                "fmla v16.4s, v14.4s, v0.s[3] \n"
                "fmla v17.4s, v14.4s, v1.s[3] \n"
                "fmla v18.4s, v14.4s, v2.s[3] \n"
                "fmla v19.4s, v14.4s, v3.s[3] \n"
                "fmla v20.4s, v14.4s, v4.s[3] \n"
                "fmla v21.4s, v14.4s, v5.s[3] \n"
                "fmla v22.4s, v14.4s, v6.s[3] \n"
                "fmla v23.4s, v14.4s, v7.s[3] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v24.4s, v15.4s, v0.s[3] \n"
                "fmla v25.4s, v15.4s, v1.s[3] \n"
                "fmla v26.4s, v15.4s, v2.s[3] \n"
                "fmla v27.4s, v15.4s, v3.s[3] \n"
                "fmla v28.4s, v15.4s, v4.s[3] \n"
                "fmla v29.4s, v15.4s, v5.s[3] \n"
                "fmla v30.4s, v15.4s, v6.s[3] \n"
                "fmla v31.4s, v15.4s, v7.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
                "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
                "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(outptr1),   // %2
                "=r"(tmpptr),    // %3
                "=r"(kptr01)     // %4
                : "0"(nn),
                "1"(outptr0),
                "2"(outptr1),
                "3"(tmpptr),
                "4"(kptr01),
                "r"(biasptr)     // %10
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
        }
        for (; i + 3 < size; i += 4)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);

            const float* kptr01 = (const float*)kernel.channel(pp);

            int nn = inch; // inch always > 0

            asm volatile(
                "ld1 {v0.4s, v1.4s}, [%10] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"
                "mov v20.16b, v1.16b \n"
                "mov v21.16b, v1.16b \n"
                "mov v22.16b, v1.16b \n"
                "mov v23.16b, v1.16b \n"

                "0: \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"
                "fmla v18.4s, v8.4s, v2.s[0] \n"
                "fmla v19.4s, v8.4s, v3.s[0] \n"

                "fmla v20.4s, v9.4s, v0.s[0] \n"
                "fmla v21.4s, v9.4s, v1.s[0] \n"
                "fmla v22.4s, v9.4s, v2.s[0] \n"
                "fmla v23.4s, v9.4s, v3.s[0] \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01

                "fmla v16.4s, v10.4s, v0.s[1] \n"
                "fmla v17.4s, v10.4s, v1.s[1] \n"
                "fmla v18.4s, v10.4s, v2.s[1] \n"
                "fmla v19.4s, v10.4s, v3.s[1] \n"

                "fmla v20.4s, v11.4s, v0.s[1] \n"
                "fmla v21.4s, v11.4s, v1.s[1] \n"
                "fmla v22.4s, v11.4s, v2.s[1] \n"
                "fmla v23.4s, v11.4s, v3.s[1] \n"

                "fmla v16.4s, v12.4s, v0.s[2] \n"
                "fmla v17.4s, v12.4s, v1.s[2] \n"
                "fmla v18.4s, v12.4s, v2.s[2] \n"
                "fmla v19.4s, v12.4s, v3.s[2] \n"

                "fmla v20.4s, v13.4s, v0.s[2] \n"
                "fmla v21.4s, v13.4s, v1.s[2] \n"
                "fmla v22.4s, v13.4s, v2.s[2] \n"
                "fmla v23.4s, v13.4s, v3.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v14.4s, v0.s[3] \n"
                "fmla v17.4s, v14.4s, v1.s[3] \n"
                "fmla v18.4s, v14.4s, v2.s[3] \n"
                "fmla v19.4s, v14.4s, v3.s[3] \n"

                "fmla v20.4s, v15.4s, v0.s[3] \n"
                "fmla v21.4s, v15.4s, v1.s[3] \n"
                "fmla v22.4s, v15.4s, v2.s[3] \n"
                "fmla v23.4s, v15.4s, v3.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(outptr1),   // %2
                "=r"(tmpptr),    // %3
                "=r"(kptr01)     // %4
                : "0"(nn),
                "1"(outptr0),
                "2"(outptr1),
                "3"(tmpptr),
                "4"(kptr01),
                "r"(biasptr)     // %10
                : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
        }
        for (; i + 1 < size; i += 2)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);

            const float* kptr01 = (const float*)kernel.channel(pp);

            int nn = inch; // inch always > 0

            asm volatile(
                "ld1 {v0.4s, v1.4s}, [%10] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v1.16b \n"
                "mov v19.16b, v1.16b \n"

                "0: \n"

                "prfm pldl1keep, [%3, #256] \n"
                "ld1 {v0.4s, v1.4s}, [%3], #32 \n" // r0 r1

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"
                "fmla v18.4s, v9.4s, v0.s[0] \n"
                "fmla v19.4s, v9.4s, v1.s[0] \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01

                "fmla v16.4s, v10.4s, v0.s[1] \n"
                "fmla v17.4s, v10.4s, v1.s[1] \n"
                "fmla v18.4s, v11.4s, v0.s[1] \n"
                "fmla v19.4s, v11.4s, v1.s[1] \n"

                "fmla v16.4s, v12.4s, v0.s[2] \n"
                "fmla v17.4s, v12.4s, v1.s[2] \n"
                "fmla v18.4s, v13.4s, v0.s[2] \n"
                "fmla v19.4s, v13.4s, v1.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v14.4s, v0.s[3] \n"
                "fmla v17.4s, v14.4s, v1.s[3] \n"
                "fmla v18.4s, v15.4s, v0.s[3] \n"
                "fmla v19.4s, v15.4s, v1.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s}, [%1], #32 \n"
                "st1 {v18.4s, v19.4s}, [%2], #32 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(outptr1),   // %2
                "=r"(tmpptr),    // %3
                "=r"(kptr01)     // %4
                : "0"(nn),
                "1"(outptr0),
                "2"(outptr1),
                "3"(tmpptr),
                "4"(kptr01),
                "r"(biasptr)     // %10
                : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
        }
        for (; i < size; i++)
        {
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);

            const float* kptr01 = (const float*)kernel.channel(pp);

            int nn = inch; // inch always > 0

            asm volatile(
                "ld1 {v16.4s, v17.4s}, [%10] \n"

                "0: \n"

                "prfm pldl1keep, [%3, #128] \n"
                "ld1 {v0.4s}, [%3], #16 \n" // r0

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v9.4s, v0.s[0] \n"

                "prfm pldl1keep, [%4, #512] \n"
                "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01

                "fmla v16.4s, v10.4s, v0.s[1] \n"
                "fmla v17.4s, v11.4s, v0.s[1] \n"

                "fmla v16.4s, v12.4s, v0.s[2] \n"
                "fmla v17.4s, v13.4s, v0.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v14.4s, v0.s[3] \n"
                "fmla v17.4s, v15.4s, v0.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s}, [%1], #16 \n"
                "st1 {v17.4s}, [%2], #16 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(outptr1),   // %2
                "=r"(tmpptr),    // %3
                "=r"(kptr01)     // %4
                : "0"(nn),
                "1"(outptr0),
                "2"(outptr1),
                "3"(tmpptr),
                "4"(kptr01),
                "r"(biasptr)     // %10
                : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
        }
    }

#endif // __ARM_NEON && __aarch64__

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* outptr0 = top_blob.channel(p);

        const float zeros[4] = {0.f, 0.f, 0.f, 0.f};
        const float* biasptr = bias ? bias + p * 4 : zeros;

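        // on aarch64 the packed kernel stores 8-outch blocks first, so the
        // weights for a lone pack4 output channel p live at channel p / 2 + p % 2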
        int i = 0;
#if __aarch64__
        for (; i + 11 < size; i += 12)
        {
            float* tmpptr = tmp.channel(i / 12);

            const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);

            int nn = inch; // inch always > 0

            asm volatile(
                "ld1 {v0.4s}, [%8] \n"
                "mov v8.16b, v0.16b \n"
                "mov v9.16b, v0.16b \n"
                "mov v10.16b, v0.16b \n"
                "mov v11.16b, v0.16b \n"
                "mov v12.16b, v0.16b \n"
                "mov v13.16b, v0.16b \n"
                "mov v14.16b, v0.16b \n"
                "mov v15.16b, v0.16b \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"

                "0: \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0

                "fmla v8.4s, v4.4s, v0.s[0] \n"
                "fmla v9.4s, v4.4s, v0.s[1] \n"
                "fmla v10.4s, v4.4s, v0.s[2] \n"
                "fmla v11.4s, v4.4s, v0.s[3] \n"
                "fmla v12.4s, v4.4s, v1.s[0] \n"
                "fmla v13.4s, v4.4s, v1.s[1] \n"
                "fmla v14.4s, v4.4s, v1.s[2] \n"
                "fmla v15.4s, v4.4s, v1.s[3] \n"
                "fmla v16.4s, v4.4s, v2.s[0] \n"
                "fmla v17.4s, v4.4s, v2.s[1] \n"
                "fmla v18.4s, v4.4s, v2.s[2] \n"
                "fmla v19.4s, v4.4s, v2.s[3] \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"

                "fmla v8.4s, v5.4s, v3.s[0] \n"
                "fmla v9.4s, v5.4s, v3.s[1] \n"
                "fmla v10.4s, v5.4s, v3.s[2] \n"
                "fmla v11.4s, v5.4s, v3.s[3] \n"
                "fmla v12.4s, v5.4s, v20.s[0] \n"
                "fmla v13.4s, v5.4s, v20.s[1] \n"
                "fmla v14.4s, v5.4s, v20.s[2] \n"
                "fmla v15.4s, v5.4s, v20.s[3] \n"
                "fmla v16.4s, v5.4s, v21.s[0] \n"
                "fmla v17.4s, v5.4s, v21.s[1] \n"
                "fmla v18.4s, v5.4s, v21.s[2] \n"
                "fmla v19.4s, v5.4s, v21.s[3] \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"

                "fmla v8.4s, v6.4s, v22.s[0] \n"
                "fmla v9.4s, v6.4s, v22.s[1] \n"
                "fmla v10.4s, v6.4s, v22.s[2] \n"
                "fmla v11.4s, v6.4s, v22.s[3] \n"
                "fmla v12.4s, v6.4s, v23.s[0] \n"
                "fmla v13.4s, v6.4s, v23.s[1] \n"
                "fmla v14.4s, v6.4s, v23.s[2] \n"
                "fmla v15.4s, v6.4s, v23.s[3] \n"
                "fmla v16.4s, v6.4s, v24.s[0] \n"
                "fmla v17.4s, v6.4s, v24.s[1] \n"
                "fmla v18.4s, v6.4s, v24.s[2] \n"
                "fmla v19.4s, v6.4s, v24.s[3] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v8.4s, v7.4s, v25.s[0] \n"
                "fmla v9.4s, v7.4s, v25.s[1] \n"
                "fmla v10.4s, v7.4s, v25.s[2] \n"
                "fmla v11.4s, v7.4s, v25.s[3] \n"
                "fmla v12.4s, v7.4s, v26.s[0] \n"
                "fmla v13.4s, v7.4s, v26.s[1] \n"
                "fmla v14.4s, v7.4s, v26.s[2] \n"
                "fmla v15.4s, v7.4s, v26.s[3] \n"
                "fmla v16.4s, v7.4s, v27.s[0] \n"
                "fmla v17.4s, v7.4s, v27.s[1] \n"
                "fmla v18.4s, v7.4s, v27.s[2] \n"
                "fmla v19.4s, v7.4s, v27.s[3] \n"

                "bne 0b \n"

                "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
                "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
        }
#endif
        for (; i + 7 < size; i += 8)
        {
#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8);
            const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
#else
            float* tmpptr = tmp.channel(i / 8);
            const float* kptr0 = (const float*)kernel.channel(p);
#endif

            int nn = inch; // inch always > 0

#if __aarch64__
            asm volatile(
                "ld1 {v0.4s}, [%8] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"
                "mov v20.16b, v0.16b \n"
                "mov v21.16b, v0.16b \n"
                "mov v22.16b, v0.16b \n"
                "mov v23.16b, v0.16b \n"

                "0: \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"
                "fmla v18.4s, v8.4s, v2.s[0] \n"
                "fmla v19.4s, v8.4s, v3.s[0] \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7

                "fmla v20.4s, v8.4s, v4.s[0] \n"
                "fmla v21.4s, v8.4s, v5.s[0] \n"
                "fmla v22.4s, v8.4s, v6.s[0] \n"
                "fmla v23.4s, v8.4s, v7.s[0] \n"

                "fmla v16.4s, v9.4s, v0.s[1] \n"
                "fmla v17.4s, v9.4s, v1.s[1] \n"
                "fmla v18.4s, v9.4s, v2.s[1] \n"
                "fmla v19.4s, v9.4s, v3.s[1] \n"
                "fmla v20.4s, v9.4s, v4.s[1] \n"
                "fmla v21.4s, v9.4s, v5.s[1] \n"
                "fmla v22.4s, v9.4s, v6.s[1] \n"
                "fmla v23.4s, v9.4s, v7.s[1] \n"

                "fmla v16.4s, v10.4s, v0.s[2] \n"
                "fmla v17.4s, v10.4s, v1.s[2] \n"
                "fmla v18.4s, v10.4s, v2.s[2] \n"
                "fmla v19.4s, v10.4s, v3.s[2] \n"
                "fmla v20.4s, v10.4s, v4.s[2] \n"
                "fmla v21.4s, v10.4s, v5.s[2] \n"
                "fmla v22.4s, v10.4s, v6.s[2] \n"
                "fmla v23.4s, v10.4s, v7.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v11.4s, v0.s[3] \n"
                "fmla v17.4s, v11.4s, v1.s[3] \n"
                "fmla v18.4s, v11.4s, v2.s[3] \n"
                "fmla v19.4s, v11.4s, v3.s[3] \n"
                "fmla v20.4s, v11.4s, v4.s[3] \n"
                "fmla v21.4s, v11.4s, v5.s[3] \n"
                "fmla v22.4s, v11.4s, v6.s[3] \n"
                "fmla v23.4s, v11.4s, v7.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
                "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else
            asm volatile(
                "vld1.f32 {d0-d1}, [%8] \n"
                "vmov q8, q0 \n"
                "vmov q9, q0 \n"
                "vmov q10, q0 \n"
                "vmov q11, q0 \n"
                "vmov q12, q0 \n"
                "vmov q13, q0 \n"
                "vmov q14, q0 \n"
                "vmov q15, q0 \n"

                "0: \n"

                "pld [%2, #512] \n"
                "vldm %2!, {d0-d7} \n"

                "pld [%3, #512] \n"
                "vldm %3!, {d8-d15} \n"

                "vmla.f32 q8, q4, d0[0] \n"
                "vmla.f32 q9, q4, d0[1] \n"
                "vmla.f32 q10, q4, d1[0] \n"
                "vmla.f32 q11, q4, d1[1] \n"
                "vmla.f32 q12, q4, d2[0] \n"
                "vmla.f32 q13, q4, d2[1] \n"
                "vmla.f32 q14, q4, d3[0] \n"
                "vmla.f32 q15, q4, d3[1] \n"

                "vmla.f32 q8, q5, d4[0] \n"
                "vmla.f32 q9, q5, d4[1] \n"
                "vmla.f32 q10, q5, d5[0] \n"
                "vmla.f32 q11, q5, d5[1] \n"
                "vmla.f32 q12, q5, d6[0] \n"
                "vmla.f32 q13, q5, d6[1] \n"
                "vmla.f32 q14, q5, d7[0] \n"
                "vmla.f32 q15, q5, d7[1] \n"

                "pld [%2, #512] \n"
                "vldm %2!, {d0-d7} \n"

                "vmla.f32 q8, q6, d0[0] \n"
                "vmla.f32 q9, q6, d0[1] \n"
                "vmla.f32 q10, q6, d1[0] \n"
                "vmla.f32 q11, q6, d1[1] \n"
                "vmla.f32 q12, q6, d2[0] \n"
                "vmla.f32 q13, q6, d2[1] \n"
                "vmla.f32 q14, q6, d3[0] \n"
                "vmla.f32 q15, q6, d3[1] \n"

                "subs %0, %0, #1 \n"

                "vmla.f32 q8, q7, d4[0] \n"
                "vmla.f32 q9, q7, d4[1] \n"
                "vmla.f32 q10, q7, d5[0] \n"
                "vmla.f32 q11, q7, d5[1] \n"
                "vmla.f32 q12, q7, d6[0] \n"
                "vmla.f32 q13, q7, d6[1] \n"
                "vmla.f32 q14, q7, d7[0] \n"
                "vmla.f32 q15, q7, d7[1] \n"

                "bne 0b \n"

                "vstm %1!, {d16-d23} \n"
                "vstm %1!, {d24-d31} \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif
        }
        for (; i + 3 < size; i += 4)
        {
#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
            const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4);
            const float* kptr0 = (const float*)kernel.channel(p);
#endif

            int nn = inch; // inch always > 0

#if __aarch64__
            asm volatile(
                "ld1 {v0.4s}, [%8] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"
                "mov v18.16b, v0.16b \n"
                "mov v19.16b, v0.16b \n"

                "0: \n"

                "prfm pldl1keep, [%2, #512] \n"
                "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"
                "fmla v18.4s, v8.4s, v2.s[0] \n"
                "fmla v19.4s, v8.4s, v3.s[0] \n"

                "fmla v16.4s, v9.4s, v0.s[1] \n"
                "fmla v17.4s, v9.4s, v1.s[1] \n"
                "fmla v18.4s, v9.4s, v2.s[1] \n"
                "fmla v19.4s, v9.4s, v3.s[1] \n"

                "fmla v16.4s, v10.4s, v0.s[2] \n"
                "fmla v17.4s, v10.4s, v1.s[2] \n"
                "fmla v18.4s, v10.4s, v2.s[2] \n"
                "fmla v19.4s, v10.4s, v3.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v11.4s, v0.s[3] \n"
                "fmla v17.4s, v11.4s, v1.s[3] \n"
                "fmla v18.4s, v11.4s, v2.s[3] \n"
                "fmla v19.4s, v11.4s, v3.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
#else
            asm volatile(
                "vld1.f32 {d0-d1}, [%8] \n"
                "vmov q8, q0 \n"
                "vmov q9, q0 \n"
                "vmov q10, q0 \n"
                "vmov q11, q0 \n"

                "0: \n"

                "pld [%2, #512] \n"
                "vldm %2!, {d0-d7} \n"

                "pld [%3, #512] \n"
                "vldm %3!, {d8-d15} \n"

                "vmla.f32 q8, q4, d0[0] \n"
                "vmla.f32 q9, q4, d2[0] \n"
                "vmla.f32 q10, q4, d4[0] \n"
                "vmla.f32 q11, q4, d6[0] \n"

                "vmla.f32 q8, q5, d0[1] \n"
                "vmla.f32 q9, q5, d2[1] \n"
                "vmla.f32 q10, q5, d4[1] \n"
                "vmla.f32 q11, q5, d6[1] \n"

                "vmla.f32 q8, q6, d1[0] \n"
                "vmla.f32 q9, q6, d3[0] \n"
                "vmla.f32 q10, q6, d5[0] \n"
                "vmla.f32 q11, q6, d7[0] \n"

                "subs %0, %0, #1 \n"

                "vmla.f32 q8, q7, d1[1] \n"
                "vmla.f32 q9, q7, d3[1] \n"
                "vmla.f32 q10, q7, d5[1] \n"
                "vmla.f32 q11, q7, d7[1] \n"

                "bne 0b \n"

                "vstm %1!, {d16-d23} \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
#endif
        }
        for (; i + 1 < size; i += 2)
        {
#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
            const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2);
            const float* kptr0 = (const float*)kernel.channel(p);
#endif

            int nn = inch; // inch always > 0

#if __aarch64__
            asm volatile(
                "ld1 {v0.4s}, [%8] \n"
                "mov v16.16b, v0.16b \n"
                "mov v17.16b, v0.16b \n"

                "0: \n"

                "prfm pldl1keep, [%2, #256] \n"
                "ld1 {v0.4s, v1.4s}, [%2], #32 \n" // r0 r1

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v17.4s, v8.4s, v1.s[0] \n"

                "fmla v16.4s, v9.4s, v0.s[1] \n"
                "fmla v17.4s, v9.4s, v1.s[1] \n"

                "fmla v16.4s, v10.4s, v0.s[2] \n"
                "fmla v17.4s, v10.4s, v1.s[2] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v11.4s, v0.s[3] \n"
                "fmla v17.4s, v11.4s, v1.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s, v17.4s}, [%1], #32 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
#else
            asm volatile(
                "vld1.f32 {d0-d1}, [%8] \n"
                "vmov q8, q0 \n"
                "vmov q9, q0 \n"

                "0: \n"

                "pld [%2, #256] \n"
                "vld1.f32 {d0-d3}, [%2 :128]! \n"

                "pld [%3, #512] \n"
                "vldm %3!, {d8-d15} \n"

                "vmla.f32 q8, q4, d0[0] \n"
                "vmla.f32 q9, q4, d2[0] \n"

                "vmla.f32 q8, q5, d0[1] \n"
                "vmla.f32 q9, q5, d2[1] \n"

                "vmla.f32 q8, q6, d1[0] \n"
                "vmla.f32 q9, q6, d3[0] \n"

                "subs %0, %0, #1 \n"

                "vmla.f32 q8, q7, d1[1] \n"
                "vmla.f32 q9, q7, d3[1] \n"

                "bne 0b \n"

                "vst1.f32 {d16-d19}, [%1 :128]! \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
        }
        for (; i < size; i++)
        {
#if __aarch64__
            float* tmpptr = tmp.channel(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
            const float* kptr0 = (const float*)kernel.channel(p / 2 + p % 2);
#else
            float* tmpptr = tmp.channel(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
            const float* kptr0 = (const float*)kernel.channel(p);
#endif

            int nn = inch; // inch always > 0

#if __aarch64__
            asm volatile(
                "ld1 {v16.4s}, [%8] \n"

                "0: \n"

                "prfm pldl1keep, [%2, #128] \n"
                "ld1 {v0.4s}, [%2], #16 \n" // r0

                "prfm pldl1keep, [%3, #512] \n"
                "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123

                "fmla v16.4s, v8.4s, v0.s[0] \n"
                "fmla v16.4s, v9.4s, v0.s[1] \n"

                "subs %w0, %w0, #1 \n"

                "fmla v16.4s, v10.4s, v0.s[2] \n"
                "fmla v16.4s, v11.4s, v0.s[3] \n"

                "bne 0b \n"

                "st1 {v16.4s}, [%1], #16 \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
#else
            asm volatile(
                "vld1.f32 {d16-d17}, [%8] \n"

                "0: \n"

                "pld [%2, #128] \n"
                "vld1.f32 {d0-d1}, [%2 :128]! \n"

                "pld [%3, #512] \n"
                "vldm %3!, {d8-d15} \n"

                "vmla.f32 q8, q4, d0[0] \n"
                "vmla.f32 q8, q5, d0[1] \n"

                "subs %0, %0, #1 \n"

                "vmla.f32 q8, q6, d1[0] \n"
                "vmla.f32 q8, q7, d1[1] \n"

                "bne 0b \n"

                "vst1.f32 {d16-d17}, [%1 :128]! \n"

                : "=r"(nn),      // %0
                "=r"(outptr0),   // %1
                "=r"(tmpptr),    // %2
                "=r"(kptr0)      // %3
                : "0"(nn),
                "1"(outptr0),
                "2"(tmpptr),
                "3"(kptr0),
                "r"(biasptr)     // %8
                : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
#endif
        }
    }

    // // NOTE sgemm
    // for (; p<outch; p++)
    // {
    //     Mat out0 = top_blob.channel(p);
    //
    //     const float bias0 = bias ? bias[p] : 0.f;
    //
    //     float* outptr0 = out0;
    //
    //     for (int i=0; i<size; i++)
    //     {
    //         float sum = bias0;
    //
    //         const float* kptr = _kernel.channel(p);
    //
    //         for (int q=0; q<inch; q++)
    //         {
    //             const float* img0 = bottom_blob.channel(q);
    //
    //             sum += img0[i] * kptr[0];
    //             kptr ++;
    //         }
    //
    //         outptr0[i] = sum;
    //     }
    // }
}

static void conv1x1s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
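    // stride 2 on a 1x1 kernel only drops pixels: gather every second pixel of
    // every second row into a dense blob, then reuse the stride-1 sgemm path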
    int w = bottom_blob.w;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int outw = top_blob.w;
    int outh = top_blob.h;

    const int tailstep = (w - 2 * outw + w) * 4;
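    // tailstep skips the rest of the current row plus the whole next (dropped)
    // row, measured in floats (x4 for pack4)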

    Mat bottom_blob_shrinked;
    bottom_blob_shrinked.create(outw, outh, channels, elemsize, elempack, opt.workspace_allocator);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < channels; p++)
    {
        const float* r0 = bottom_blob.channel(p);
        float* outptr = bottom_blob_shrinked.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float32x4_t _v = vld1q_f32(r0);
                vst1q_f32(outptr, _v);

                r0 += 8;
                outptr += 4;
            }

            r0 += tailstep;
        }
    }

    conv1x1s1_sgemm_pack4_neon(bottom_blob_shrinked, top_blob, kernel, _bias, opt);
}