// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

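// Direct 3x3 stride-1 convolution. Output channels are processed two at a
// time and output rows two at a time, with NEON inner loops producing four
// output pixels per iteration and a scalar/intrinsic tail for the remainder.
//
// Reference (scalar) semantics, for clarity:
//   for each output channel p and output pixel (y, x):
//     top[p][y][x] = bias[p] + sum over q, ky, kx of
//                    bottom[q][y + ky][x + kx] * kernel[p][q][ky][kx]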
static void conv3x3s1_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    int nn_outch = outch >> 1;
    int remain_outch_start = nn_outch << 1;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 2;

        Mat out0 = top_blob.channel(p);
        Mat out1 = top_blob.channel(p + 1);

        const float bias0 = bias ? bias[p] : 0.f;
        const float bias1 = bias ? bias[p + 1] : 0.f;

        out0.fill(bias0);
        out1.fill(bias1);

        const float* k0 = kernel + p * inch * 9;
        const float* k1 = kernel + (p + 1) * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr0 = out0;
            float* outptr1 = out1;
            float* outptr0n = outptr0 + outw;
            float* outptr1n = outptr1 + outw;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;

#if __ARM_NEON
            float32x4_t _k00 = vld1q_f32(k0);
            float32x4_t _k03 = vld1q_f32(k0 + 3);
            float32x4_t _k06 = vld1q_f32(k0 + 6);

            float32x4_t _k10 = vld1q_f32(k1);
            float32x4_t _k13 = vld1q_f32(k1 + 3);
            float32x4_t _k16 = vld1q_f32(k1 + 6);
#endif // __ARM_NEON

            int i = 0;

            for (; i + 1 < outh; i += 2)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm pldl1keep, [%5, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%5] \n" // r0
                        "add %5, %5, #16 \n"

                        "prfm pldl1keep, [%8, #256] \n"
                        "ld1 {v14.4s, v15.4s}, [%8] \n" // r3
                        "add %8, %8, #16 \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v14.16b, v15.16b, #8 \n"

                        "0: \n"

                        "prfm pldl1keep, [%1, #128] \n"
                        "ld1 {v6.4s}, [%1] \n" // _sum0

                        "prfm pldl1keep, [%2, #128] \n"
                        "ld1 {v7.4s}, [%2] \n" // _sum1

                        "fmla v6.4s, v8.4s, %18.s[0] \n"
                        "fmla v7.4s, v8.4s, %21.s[0] \n"

                        "prfm pldl1keep, [%3, #128] \n"
                        "ld1 {v12.4s}, [%3] \n" // _sum0n

                        "prfm pldl1keep, [%4, #128] \n"
                        "ld1 {v13.4s}, [%4] \n" // _sum1n

                        "fmla v12.4s, v14.4s, %20.s[0] \n"
                        "fmla v13.4s, v14.4s, %23.s[0] \n"

                        "ext v8.16b, v8.16b, v9.16b, #8 \n"
                        "ext v9.16b, v14.16b, v15.16b, #4 \n"

                        "fmla v6.4s, v10.4s, %18.s[1] \n"
                        "fmla v7.4s, v10.4s, %21.s[1] \n"
                        "fmla v12.4s, v11.4s, %20.s[2] \n"
                        "fmla v13.4s, v11.4s, %23.s[2] \n"

                        "prfm pldl1keep, [%6, #256] \n"
                        "ld1 {v14.4s, v15.4s}, [%6] \n" // r1
                        "add %6, %6, #16 \n"

                        "fmla v6.4s, v8.4s, %18.s[2] \n"
                        "fmla v7.4s, v8.4s, %21.s[2] \n"
                        "fmla v12.4s, v9.4s, %20.s[1] \n"
                        "fmla v13.4s, v9.4s, %23.s[1] \n"

                        "ext v10.16b, v14.16b, v15.16b, #4 \n"

                        "fmla v6.4s, v14.4s, %19.s[0] \n"
                        "fmla v7.4s, v14.4s, %22.s[0] \n"
                        "fmla v12.4s, v14.4s, %18.s[0] \n"
                        "fmla v13.4s, v14.4s, %21.s[0] \n"

                        "ext v11.16b, v14.16b, v15.16b, #8 \n"

                        "fmla v6.4s, v10.4s, %19.s[1] \n"
                        "fmla v7.4s, v10.4s, %22.s[1] \n"
                        "fmla v12.4s, v10.4s, %18.s[1] \n"
                        "fmla v13.4s, v10.4s, %21.s[1] \n"

                        "prfm pldl1keep, [%7, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%7] \n" // r2
                        "add %7, %7, #16 \n"

                        "fmla v6.4s, v11.4s, %19.s[2] \n"
                        "fmla v7.4s, v11.4s, %22.s[2] \n"
                        "fmla v12.4s, v11.4s, %18.s[2] \n"
                        "fmla v13.4s, v11.4s, %21.s[2] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"

                        "fmla v6.4s, v8.4s, %20.s[0] \n"
                        "fmla v7.4s, v8.4s, %23.s[0] \n"
                        "fmla v12.4s, v8.4s, %19.s[0] \n"
                        "fmla v13.4s, v8.4s, %22.s[0] \n"

                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v6.4s, v10.4s, %20.s[1] \n"
                        "fmla v7.4s, v10.4s, %23.s[1] \n"
                        "fmla v12.4s, v10.4s, %19.s[1] \n"
                        "fmla v13.4s, v10.4s, %22.s[1] \n"

                        "prfm pldl1keep, [%5, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%5] \n" // r0
                        "add %5, %5, #16 \n"

                        "fmla v6.4s, v11.4s, %20.s[2] \n"
                        "fmla v7.4s, v11.4s, %23.s[2] \n"
                        "fmla v12.4s, v11.4s, %19.s[2] \n"
                        "fmla v13.4s, v11.4s, %22.s[2] \n"

                        "prfm pldl1keep, [%8, #256] \n"
                        "ld1 {v14.4s, v15.4s}, [%8] \n" // r3
                        "add %8, %8, #16 \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"

                        "st1 {v6.4s}, [%1], #16 \n"
                        "st1 {v7.4s}, [%2], #16 \n"

                        "ext v11.16b, v14.16b, v15.16b, #8 \n"

                        "st1 {v12.4s}, [%3], #16 \n"
                        "st1 {v13.4s}, [%4], #16 \n"

                        "subs %w0, %w0, #1 \n"
                        "bne 0b \n"

                        "sub %5, %5, #16 \n"
                        "sub %8, %8, #16 \n"
                        : "=r"(nn),       // %0
                        "=r"(outptr0),    // %1
                        "=r"(outptr1),    // %2
                        "=r"(outptr0n),   // %3
                        "=r"(outptr1n),   // %4
                        "=r"(r0),         // %5
                        "=r"(r1),         // %6
                        "=r"(r2),         // %7
                        "=r"(r3)          // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr0n),
                        "4"(outptr1n),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k00), // %18
                        "w"(_k03), // %19
                        "w"(_k06), // %20
                        "w"(_k10), // %21
                        "w"(_k13), // %22
                        "w"(_k16)  // %23
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(

                        "pld [%5, #192] \n"
                        "vld1.f32 {d16-d18}, [%5 :64] \n" // r0
                        "add %5, #16 \n"

                        "pld [%8, #192] \n"
                        "vld1.f32 {d28-d30}, [%8] \n" // r3
                        "add %8, #16 \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q14, q15, #2 \n"

                        "0: \n"

                        "pld [%1, #128] \n"
                        "vld1.f32 {d12-d13}, [%1 :64] \n" // _sum0

                        "pld [%2, #128] \n"
                        "vld1.f32 {d14-d15}, [%2 :64] \n" // _sum1

                        "vmla.f32 q6, q8, %e18[0] \n"
                        "vmla.f32 q7, q8, %e21[0] \n"

                        "pld [%3, #128] \n"
                        "vld1.f32 {d24-d25}, [%3] \n" // _sum0n

                        "pld [%4, #128] \n"
                        "vld1.f32 {d26-d27}, [%4] \n" // _sum1n

                        "vmla.f32 q12, q14, %e20[0] \n"
                        "vmla.f32 q13, q14, %e23[0] \n"

                        "vext.32 q8, q8, q9, #2 \n"
                        "vext.32 q9, q14, q15, #1 \n"

                        "vmla.f32 q6, q10, %e18[1] \n"
                        "vmla.f32 q7, q10, %e21[1] \n"
                        "vmla.f32 q12, q11, %f20[0] \n"
                        "vmla.f32 q13, q11, %f23[0] \n"

                        "pld [%6, #192] \n"
                        "vld1.f32 {d28-d30}, [%6] \n" // r1
                        "add %6, #16 \n"

                        "vmla.f32 q6, q8, %f18[0] \n"
                        "vmla.f32 q7, q8, %f21[0] \n"
                        "vmla.f32 q12, q9, %e20[1] \n"
                        "vmla.f32 q13, q9, %e23[1] \n"

                        "vext.32 q10, q14, q15, #1 \n"

                        "vmla.f32 q6, q14, %e19[0] \n"
                        "vmla.f32 q7, q14, %e22[0] \n"
                        "vmla.f32 q12, q14, %e18[0] \n"
                        "vmla.f32 q13, q14, %e21[0] \n"

                        "vext.32 q11, q14, q15, #2 \n"

                        "vmla.f32 q6, q10, %e19[1] \n"
                        "vmla.f32 q7, q10, %e22[1] \n"
                        "vmla.f32 q12, q10, %e18[1] \n"
                        "vmla.f32 q13, q10, %e21[1] \n"

                        "pld [%7, #192] \n"
                        "vld1.f32 {d16-d18}, [%7 :64] \n" // r2
                        "add %7, #16 \n"

                        "vmla.f32 q6, q11, %f19[0] \n"
                        "vmla.f32 q7, q11, %f22[0] \n"
                        "vmla.f32 q12, q11, %f18[0] \n"
                        "vmla.f32 q13, q11, %f21[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"

                        "vmla.f32 q6, q8, %e20[0] \n"
                        "vmla.f32 q7, q8, %e23[0] \n"
                        "vmla.f32 q12, q8, %e19[0] \n"
                        "vmla.f32 q13, q8, %e22[0] \n"

                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q6, q10, %e20[1] \n"
                        "vmla.f32 q7, q10, %e23[1] \n"
                        "vmla.f32 q12, q10, %e19[1] \n"
                        "vmla.f32 q13, q10, %e22[1] \n"

                        "pld [%5, #192] \n"
                        "vld1.f32 {d16-d18}, [%5 :64] \n" // r0
                        "add %5, #16 \n"

                        "vmla.f32 q6, q11, %f20[0] \n"
                        "vmla.f32 q7, q11, %f23[0] \n"
                        "vmla.f32 q12, q11, %f19[0] \n"
                        "vmla.f32 q13, q11, %f22[0] \n"

                        "pld [%8, #192] \n"
                        "vld1.f32 {d28-d30}, [%8] \n" // r3
                        "add %8, #16 \n"

                        "vext.32 q10, q8, q9, #1 \n"

                        "vst1.f32 {d12-d13}, [%1 : 64]!\n"
                        "vst1.f32 {d14-d15}, [%2 : 64]!\n"

                        "vext.32 q11, q14, q15, #2 \n"

                        "vst1.f32 {d24-d25}, [%3]! \n"
                        "vst1.f32 {d26-d27}, [%4]! \n"

                        "subs %0, #1 \n"
                        "bne 0b \n"

                        "sub %5, #16 \n"
                        "sub %8, #16 \n"
                        : "=r"(nn),       // %0
                        "=r"(outptr0),    // %1
                        "=r"(outptr1),    // %2
                        "=r"(outptr0n),   // %3
                        "=r"(outptr1n),   // %4
                        "=r"(r0),         // %5
                        "=r"(r1),         // %6
                        "=r"(r2),         // %7
                        "=r"(r3)          // %8
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(outptr0n),
                        "4"(outptr1n),
                        "5"(r0),
                        "6"(r1),
                        "7"(r2),
                        "8"(r3),
                        "w"(_k00), // %18
                        "w"(_k03), // %19
                        "w"(_k06), // %20
                        "w"(_k10), // %21
                        "w"(_k13), // %22
                        "w"(_k16)  // %23
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _sum0 = vmulq_f32(_r00, _k00);
                    float32x4_t _sum1 = vmulq_f32(_r00, _k10);
                    _sum0 = vmlaq_f32(_sum0, _r10, _k03);
                    _sum1 = vmlaq_f32(_sum1, _r10, _k13);
                    _sum0 = vmlaq_f32(_sum0, _r20, _k06);
                    _sum1 = vmlaq_f32(_sum1, _r20, _k16);

                    float32x4_t _sum0n = vmulq_f32(_r10, _k00);
                    float32x4_t _sum1n = vmulq_f32(_r10, _k10);
                    _sum0n = vmlaq_f32(_sum0n, _r20, _k03);
                    _sum1n = vmlaq_f32(_sum1n, _r20, _k13);
                    _sum0n = vmlaq_f32(_sum0n, _r30, _k06);
                    _sum1n = vmlaq_f32(_sum1n, _r30, _k16);

                    _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
                    _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
                    _sum0n = vsetq_lane_f32(*outptr0n, _sum0n, 3);
                    _sum1n = vsetq_lane_f32(*outptr1n, _sum1n, 3);
#if __aarch64__
                    *outptr0 = vaddvq_f32(_sum0);
                    *outptr1 = vaddvq_f32(_sum1);
                    *outptr0n = vaddvq_f32(_sum0n);
                    *outptr1n = vaddvq_f32(_sum1n);
#else
                    float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                    float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                    float32x2_t _ss0n = vadd_f32(vget_low_f32(_sum0n), vget_high_f32(_sum0n));
                    float32x2_t _ss1n = vadd_f32(vget_low_f32(_sum1n), vget_high_f32(_sum1n));

                    float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
                    float32x2_t _ss01n = vpadd_f32(_ss0n, _ss1n);

                    *outptr0 = vget_lane_f32(_ss01, 0);
                    *outptr1 = vget_lane_f32(_ss01, 1);
                    *outptr0n = vget_lane_f32(_ss01n, 0);
                    *outptr1n = vget_lane_f32(_ss01n, 1);
#endif // __aarch64__
#else
                    float sum0 = 0.f;
                    float sum0n = 0.f;
                    float sum1 = 0.f;
                    float sum1n = 0.f;

                    sum0 += r0[0] * k0[0];
                    sum0 += r0[1] * k0[1];
                    sum0 += r0[2] * k0[2];
                    sum0 += r1[0] * k0[3];
                    sum0 += r1[1] * k0[4];
                    sum0 += r1[2] * k0[5];
                    sum0 += r2[0] * k0[6];
                    sum0 += r2[1] * k0[7];
                    sum0 += r2[2] * k0[8];

                    sum1 += r0[0] * k1[0];
                    sum1 += r0[1] * k1[1];
                    sum1 += r0[2] * k1[2];
                    sum1 += r1[0] * k1[3];
                    sum1 += r1[1] * k1[4];
                    sum1 += r1[2] * k1[5];
                    sum1 += r2[0] * k1[6];
                    sum1 += r2[1] * k1[7];
                    sum1 += r2[2] * k1[8];

                    sum0n += r1[0] * k0[0];
                    sum0n += r1[1] * k0[1];
                    sum0n += r1[2] * k0[2];
                    sum0n += r2[0] * k0[3];
                    sum0n += r2[1] * k0[4];
                    sum0n += r2[2] * k0[5];
                    sum0n += r3[0] * k0[6];
                    sum0n += r3[1] * k0[7];
                    sum0n += r3[2] * k0[8];

                    sum1n += r1[0] * k1[0];
                    sum1n += r1[1] * k1[1];
                    sum1n += r1[2] * k1[2];
                    sum1n += r2[0] * k1[3];
                    sum1n += r2[1] * k1[4];
                    sum1n += r2[2] * k1[5];
                    sum1n += r3[0] * k1[6];
                    sum1n += r3[1] * k1[7];
                    sum1n += r3[2] * k1[8];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
                    *outptr0n += sum0n;
                    *outptr1n += sum1n;
#endif // __ARM_NEON
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr0++;
                    outptr1++;
                    outptr0n++;
                    outptr1n++;
                }

                r0 += 2 + w;
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;

                outptr0 += outw;
                outptr1 += outw;
                outptr0n += outw;
                outptr1n += outw;
            }

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "0: \n"

                        "prfm pldl1keep, [%3, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%3] \n" // r0
                        "add %3, %3, #16 \n"

                        "prfm pldl1keep, [%1, #128] \n"
                        "ld1 {v6.4s}, [%1] \n" // _sum0

                        "prfm pldl1keep, [%2, #128] \n"
                        "ld1 {v7.4s}, [%2] \n" // _sum1

                        "fmul v14.4s, v8.4s, %12.s[0] \n"
                        "fmul v15.4s, v8.4s, %15.s[0] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v6.4s, v10.4s, %12.s[1] \n"
                        "fmla v7.4s, v10.4s, %15.s[1] \n"

                        "prfm pldl1keep, [%4, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%4] \n" // r1
                        "add %4, %4, #16 \n"

                        "fmla v14.4s, v11.4s, %12.s[2] \n"
                        "fmla v15.4s, v11.4s, %15.s[2] \n"

                        "fmla v6.4s, v8.4s, %13.s[0] \n"
                        "fmla v7.4s, v8.4s, %16.s[0] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v14.4s, v10.4s, %13.s[1] \n"
                        "fmla v15.4s, v10.4s, %16.s[1] \n"

                        "prfm pldl1keep, [%5, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%5] \n" // r2
                        "add %5, %5, #16 \n"

                        "fmla v6.4s, v11.4s, %13.s[2] \n"
                        "fmla v7.4s, v11.4s, %16.s[2] \n"

                        "fmla v14.4s, v8.4s, %14.s[0] \n"
                        "fmla v15.4s, v8.4s, %17.s[0] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v6.4s, v10.4s, %14.s[1] \n"
                        "fmla v7.4s, v10.4s, %17.s[1] \n"

                        "fmla v14.4s, v11.4s, %14.s[2] \n"
                        "fmla v15.4s, v11.4s, %17.s[2] \n"

                        "fadd v6.4s, v6.4s, v14.4s \n"
                        "fadd v7.4s, v7.4s, v15.4s \n"

                        "st1 {v6.4s}, [%1], #16 \n"
                        "st1 {v7.4s}, [%2], #16 \n"

                        "subs %w0, %w0, #1 \n"
                        "bne 0b \n"

                        : "=r"(nn),       // %0
                        "=r"(outptr0),    // %1
                        "=r"(outptr1),    // %2
                        "=r"(r0),         // %3
                        "=r"(r1),         // %4
                        "=r"(r2)          // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "0: \n"

                        "pld [%3, #192] \n"
                        "vld1.f32 {d16-d18}, [%3] \n" // r0
                        "add %3, #16 \n"

                        "pld [%1, #128] \n"
                        "vld1.f32 {d12-d13}, [%1] \n" // _sum0

                        "pld [%2, #128] \n"
                        "vld1.f32 {d14-d15}, [%2] \n" // _sum1

                        "vmul.f32 q14, q8, %e12[0] \n"
                        "vmul.f32 q15, q8, %e15[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q6, q10, %e12[1] \n"
                        "vmla.f32 q7, q10, %e15[1] \n"

                        "pld [%4, #192] \n"
                        "vld1.f32 {d16-d18}, [%4] \n" // r1
                        "add %4, #16 \n"

                        "vmla.f32 q14, q11, %f12[0] \n"
                        "vmla.f32 q15, q11, %f15[0] \n"

                        "vmla.f32 q6, q8, %e13[0] \n"
                        "vmla.f32 q7, q8, %e16[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q14, q10, %e13[1] \n"
                        "vmla.f32 q15, q10, %e16[1] \n"

                        "pld [%5, #192] \n"
                        "vld1.f32 {d16-d18}, [%5] \n" // r2
                        "add %5, #16 \n"

                        "vmla.f32 q6, q11, %f13[0] \n"
                        "vmla.f32 q7, q11, %f16[0] \n"

                        "vmla.f32 q14, q8, %e14[0] \n"
                        "vmla.f32 q15, q8, %e17[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q6, q10, %e14[1] \n"
                        "vmla.f32 q7, q10, %e17[1] \n"

                        "vmla.f32 q14, q11, %f14[0] \n"
                        "vmla.f32 q15, q11, %f17[0] \n"

                        "vadd.f32 q6, q6, q14 \n"
                        "vadd.f32 q7, q7, q15 \n"

                        "vst1.f32 {d12-d13}, [%1]! \n"

                        "vst1.f32 {d14-d15}, [%2]! \n"

                        "subs %0, #1 \n"
                        "bne 0b \n"

                        : "=r"(nn),       // %0
                        "=r"(outptr0),    // %1
                        "=r"(outptr1),    // %2
                        "=r"(r0),         // %3
                        "=r"(r1),         // %4
                        "=r"(r2)          // %5
                        : "0"(nn),
                        "1"(outptr0),
                        "2"(outptr1),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "w"(_k00), // %12
                        "w"(_k03), // %13
                        "w"(_k06), // %14
                        "w"(_k10), // %15
                        "w"(_k13), // %16
                        "w"(_k16)  // %17
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum0 = vmulq_f32(_r00, _k00);
                    float32x4_t _sum1 = vmulq_f32(_r00, _k10);
                    _sum0 = vmlaq_f32(_sum0, _r10, _k03);
                    _sum1 = vmlaq_f32(_sum1, _r10, _k13);
                    _sum0 = vmlaq_f32(_sum0, _r20, _k06);
                    _sum1 = vmlaq_f32(_sum1, _r20, _k16);

                    _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
                    _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
#if __aarch64__
                    *outptr0 = vaddvq_f32(_sum0);
                    *outptr1 = vaddvq_f32(_sum1);
#else
                    float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
                    float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
                    float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);

                    *outptr0 = vget_lane_f32(_ss01, 0);
                    *outptr1 = vget_lane_f32(_ss01, 1);
#endif // __aarch64__
#else
                    float sum0 = 0.f;
                    float sum1 = 0.f;

                    sum0 += r0[0] * k0[0];
                    sum0 += r0[1] * k0[1];
                    sum0 += r0[2] * k0[2];
                    sum0 += r1[0] * k0[3];
                    sum0 += r1[1] * k0[4];
                    sum0 += r1[2] * k0[5];
                    sum0 += r2[0] * k0[6];
                    sum0 += r2[1] * k0[7];
                    sum0 += r2[2] * k0[8];

                    sum1 += r0[0] * k1[0];
                    sum1 += r0[1] * k1[1];
                    sum1 += r0[2] * k1[2];
                    sum1 += r1[0] * k1[3];
                    sum1 += r1[1] * k1[4];
                    sum1 += r1[2] * k1[5];
                    sum1 += r2[0] * k1[6];
                    sum1 += r2[1] * k1[7];
                    sum1 += r2[2] * k1[8];

                    *outptr0 += sum0;
                    *outptr1 += sum1;
#endif // __ARM_NEON
                    r0++;
                    r1++;
                    r2++;
                    outptr0++;
                    outptr1++;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            k0 += 9;
            k1 += 9;
        }
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = remain_outch_start; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        const float* kernel0 = kernel + p * inch * 9;

        for (int q = 0; q < inch; q++)
        {
            float* outptr = out;
            float* outptr2 = outptr + outw;

            const float* img0 = bottom_blob.channel(q);

            const float* r0 = img0;
            const float* r1 = img0 + w;
            const float* r2 = img0 + w * 2;
            const float* r3 = img0 + w * 3;

#if __ARM_NEON
            float32x4_t _k0123 = vld1q_f32(kernel0);
            float32x4_t _k3456 = vld1q_f32(kernel0 + 3);
            float32x4_t _k6789 = vld1q_f32(kernel0 + 6);
#else
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;
#endif // __ARM_NEON

            int i = 0;

            for (; i + 1 < outh; i += 2)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm pldl1keep, [%3, #256] \n"
                        "ld1 {v9.4s, v10.4s}, [%3] \n" // r0
                        "add %3, %3, #16 \n"

                        "ext v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext v12.16b, v9.16b, v10.16b, #8 \n"

                        "0: \n"

                        "prfm pldl1keep, [%1, #128] \n"
                        "ld1 {v7.4s}, [%1] \n" // _sum

                        "fmla v7.4s, v9.4s, %14.s[0] \n"
                        "fmul v6.4s, v11.4s, %14.s[1] \n"
                        "fmul v13.4s, v12.4s, %14.s[2] \n"

                        "prfm pldl1keep, [%4, #256] \n"
                        "ld1 {v9.4s, v10.4s}, [%4] \n" // r1
                        "add %4, %4, #16 \n"

                        "fmla v7.4s, v9.4s, %15.s[0] \n"

                        "ext v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla v6.4s, v11.4s, %15.s[1] \n"
                        "fmla v13.4s, v12.4s, %15.s[2] \n"

                        "prfm pldl1keep, [%2, #128] \n"
                        "ld1 {v8.4s}, [%2] \n" // _sum2

                        "fmla v8.4s, v9.4s, %14.s[0] \n"
                        "fmul v14.4s, v11.4s, %14.s[1] \n"
                        "fmul v15.4s, v12.4s, %14.s[2] \n"

                        "prfm pldl1keep, [%5, #256] \n"
                        "ld1 {v9.4s, v10.4s}, [%5] \n" // r2
                        "add %5, %5, #16 \n"

                        "fmla v7.4s, v9.4s, %16.s[0] \n"

                        "ext v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla v6.4s, v11.4s, %16.s[1] \n"
                        "fmla v13.4s, v12.4s, %16.s[2] \n"

                        "fmla v8.4s, v9.4s, %15.s[0] \n"
                        "fmla v14.4s, v11.4s, %15.s[1] \n"
                        "fmla v15.4s, v12.4s, %15.s[2] \n"

                        "prfm pldl1keep, [%6, #256] \n"
                        "ld1 {v9.4s, v10.4s}, [%6] \n" // r3
                        "add %6, %6, #16 \n"

                        "fmla v8.4s, v9.4s, %16.s[0] \n"

                        "ext v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext v12.16b, v9.16b, v10.16b, #8 \n"

                        "fmla v14.4s, v11.4s, %16.s[1] \n"
                        "fmla v15.4s, v12.4s, %16.s[2] \n"

                        "fadd v7.4s, v7.4s, v6.4s \n"

                        "prfm pldl1keep, [%3, #256] \n"
                        "ld1 {v9.4s, v10.4s}, [%3] \n" // r0

                        "fadd v8.4s, v8.4s, v14.4s \n"
                        "fadd v7.4s, v7.4s, v13.4s \n"
                        "fadd v8.4s, v8.4s, v15.4s \n"

                        "ext v11.16b, v9.16b, v10.16b, #4 \n"
                        "ext v12.16b, v9.16b, v10.16b, #8 \n"

                        "add %3, %3, #16 \n"

                        "st1 {v7.4s}, [%1], #16 \n"
                        "st1 {v8.4s}, [%2], #16 \n"

                        "subs %w0, %w0, #1 \n"
                        "bne 0b \n"

                        "sub %3, %3, #16 \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),    // %1
                        "=r"(outptr2),   // %2
                        "=r"(r0),        // %3
                        "=r"(r1),        // %4
                        "=r"(r2),        // %5
                        "=r"(r3)         // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "w"(_k0123), // %14
                        "w"(_k3456), // %15
                        "w"(_k6789)  // %16
                        : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "pld [%3, #192] \n"
                        "vld1.f32 {d18-d20}, [%3 :64] \n" // r0
                        "add %3, #16 \n"

                        "vext.32 q11, q9, q10, #1 \n"
                        "vext.32 q12, q9, q10, #2 \n"

                        "0: \n"

                        "pld [%1, #128] \n"
                        "vld1.f32 {d14-d15}, [%1 :64] \n" // _sum

                        "vmla.f32 q7, q9, %e14[0] \n"
                        "vmul.f32 q6, q11, %e14[1] \n"
                        "vmul.f32 q13, q12, %f14[0] \n"

                        "pld [%4, #192] \n"
                        "vld1.f32 {d18-d20}, [%4] \n" // r1
                        "add %4, #16 \n"

                        "vmla.f32 q7, q9, %e15[0] \n"

                        "vext.32 q11, q9, q10, #1 \n"
                        "vext.32 q12, q9, q10, #2 \n"

                        "vmla.f32 q6, q11, %e15[1] \n"
                        "vmla.f32 q13, q12, %f15[0] \n"

                        "pld [%2, #128] \n"
                        "vld1.f32 {d16-d17}, [%2] \n" // _sum2

                        "vmla.f32 q8, q9, %e14[0] \n"
                        "vmul.f32 q14, q11, %e14[1] \n"
                        "vmul.f32 q15, q12, %f14[0] \n"

                        "pld [%5, #192] \n"
                        "vld1.f32 {d18-d20}, [%5 :64] \n" // r2
                        "add %5, #16 \n"

                        "vmla.f32 q7, q9, %e16[0] \n"

                        "vext.32 q11, q9, q10, #1 \n"
                        "vext.32 q12, q9, q10, #2 \n"

                        "vmla.f32 q6, q11, %e16[1] \n"
                        "vmla.f32 q13, q12, %f16[0] \n"

                        "vmla.f32 q8, q9, %e15[0] \n"
                        "vmla.f32 q14, q11, %e15[1] \n"
                        "vmla.f32 q15, q12, %f15[0] \n"

                        "pld [%6, #192] \n"
                        "vld1.f32 {d18-d20}, [%6] \n" // r3
                        "add %6, #16 \n"

                        "vmla.f32 q8, q9, %e16[0] \n"

                        "vext.32 q11, q9, q10, #1 \n"
                        "vext.32 q12, q9, q10, #2 \n"

                        "vmla.f32 q14, q11, %e16[1] \n"
                        "vmla.f32 q15, q12, %f16[0] \n"

                        "vadd.f32 q7, q7, q6 \n"

                        "pld [%3, #192] \n"
                        "vld1.f32 {d18-d20}, [%3 :64] \n" // r0

                        "vadd.f32 q8, q8, q14 \n"
                        "vadd.f32 q7, q7, q13 \n"
                        "vadd.f32 q8, q8, q15 \n"

                        "vext.32 q11, q9, q10, #1 \n"
                        "vext.32 q12, q9, q10, #2 \n"

                        "add %3, #16 \n"

                        "vst1.f32 {d14-d15}, [%1]! \n"
                        "vst1.f32 {d16-d17}, [%2]! \n"

                        "subs %0, #1 \n"
                        "bne 0b \n"

                        "sub %3, #16 \n"
                        : "=r"(nn),      // %0
                        "=r"(outptr),    // %1
                        "=r"(outptr2),   // %2
                        "=r"(r0),        // %3
                        "=r"(r1),        // %4
                        "=r"(r2),        // %5
                        "=r"(r3)         // %6
                        : "0"(nn),
                        "1"(outptr),
                        "2"(outptr2),
                        "3"(r0),
                        "4"(r1),
                        "5"(r2),
                        "6"(r3),
                        "w"(_k0123), // %14
                        "w"(_k3456), // %15
                        "w"(_k6789)  // %16
                        : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);
                    float32x4_t _r30 = vld1q_f32(r3);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    float32x4_t _sum2 = vmulq_f32(_r10, _k0123);
                    _sum2 = vmlaq_f32(_sum2, _r20, _k3456);
                    _sum2 = vmlaq_f32(_sum2, _r30, _k6789);

                    _sum = vsetq_lane_f32(*outptr, _sum, 3);
                    _sum2 = vsetq_lane_f32(*outptr2, _sum2, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
                    *outptr2 = vaddvq_f32(_sum2);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    float32x2_t _ss2 = vadd_f32(vget_low_f32(_sum2), vget_high_f32(_sum2));

                    float32x2_t _sss2 = vpadd_f32(_ss, _ss2);

                    *outptr = vget_lane_f32(_sss2, 0);
                    *outptr2 = vget_lane_f32(_sss2, 1);
#endif // __aarch64__
#else
                    float sum = 0;
                    float sum2 = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    sum2 += r1[0] * k0[0];
                    sum2 += r1[1] * k0[1];
                    sum2 += r1[2] * k0[2];
                    sum2 += r2[0] * k1[0];
                    sum2 += r2[1] * k1[1];
                    sum2 += r2[2] * k1[2];
                    sum2 += r3[0] * k2[0];
                    sum2 += r3[1] * k2[1];
                    sum2 += r3[2] * k2[2];

                    *outptr += sum;
                    *outptr2 += sum2;
#endif
                    r0++;
                    r1++;
                    r2++;
                    r3++;
                    outptr++;
                    outptr2++;
                }

                r0 += 2 + w;
                r1 += 2 + w;
                r2 += 2 + w;
                r3 += 2 + w;

                outptr += outw;
                outptr2 += outw;
            }

            for (; i < outh; i++)
            {
#if __ARM_NEON
                int nn = outw >> 2;
                int remain = outw & 3;
#else
                int remain = outw;
#endif // __ARM_NEON

#if __ARM_NEON
#if __aarch64__
                if (nn > 0)
                {
                    asm volatile(
                        "prfm pldl1keep, [%2, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%2] \n" // r0
                        "add %2, %2, #16 \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "0: \n"

                        "prfm pldl1keep, [%1, #128] \n"
                        "ld1 {v7.4s}, [%1] \n" // _sum

                        "fmla v7.4s, v8.4s, %10.s[0] \n"
                        "fmul v13.4s, v10.4s, %10.s[1] \n"
                        "fmul v14.4s, v11.4s, %10.s[2] \n"

                        "prfm pldl1keep, [%3, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%3] \n" // r1
                        "add %3, %3, #16 \n"

                        "fmla v7.4s, v8.4s, %11.s[0] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v13.4s, v10.4s, %11.s[1] \n"
                        "fmla v14.4s, v11.4s, %11.s[2] \n"

                        "prfm pldl1keep, [%4, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%4] \n" // r2
                        "add %4, %4, #16 \n"

                        "fmla v7.4s, v8.4s, %12.s[0] \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "fmla v13.4s, v10.4s, %12.s[1] \n"
                        "fmla v14.4s, v11.4s, %12.s[2] \n"

                        "prfm pldl1keep, [%2, #256] \n"
                        "ld1 {v8.4s, v9.4s}, [%2] \n" // r0
                        "add %2, %2, #16 \n"

                        "fadd v7.4s, v7.4s, v13.4s \n"
                        "fadd v7.4s, v7.4s, v14.4s \n"

                        "ext v10.16b, v8.16b, v9.16b, #4 \n"
                        "ext v11.16b, v8.16b, v9.16b, #8 \n"

                        "st1 {v7.4s}, [%1], #16 \n"

                        "subs %w0, %w0, #1 \n"
                        "bne 0b \n"

                        "sub %2, %2, #16 \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr),   // %1
                        "=r"(r0),       // %2
                        "=r"(r1),       // %3
                        "=r"(r2)        // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
                }
#else
                if (nn > 0)
                {
                    asm volatile(
                        "pld [%2, #192] \n"
                        "vld1.f32 {d16-d18}, [%2] \n" // r0
                        "add %2, #16 \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "0: \n"

                        "pld [%1, #128] \n"
                        "vld1.f32 {d14-d15}, [%1] \n" // _sum

                        "vmla.f32 q7, q8, %e10[0] \n"
                        "vmul.f32 q13, q10, %e10[1] \n"
                        "vmul.f32 q14, q11, %f10[0] \n"

                        "pld [%3, #192] \n"
                        "vld1.f32 {d16-d18}, [%3] \n" // r1
                        "add %3, #16 \n"

                        "vmla.f32 q7, q8, %e11[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q13, q10, %e11[1] \n"
                        "vmla.f32 q14, q11, %f11[0] \n"

                        "pld [%4, #192] \n"
                        "vld1.f32 {d16-d18}, [%4] \n" // r2
                        "add %4, #16 \n"

                        "vmla.f32 q7, q8, %e12[0] \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vmla.f32 q13, q10, %e12[1] \n"
                        "vmla.f32 q14, q11, %f12[0] \n"

                        "pld [%2, #192] \n"
                        "vld1.f32 {d16-d18}, [%2] \n" // r0
                        "add %2, #16 \n"

                        "vadd.f32 q7, q7, q13 \n"
                        "vadd.f32 q7, q7, q14 \n"

                        "vext.32 q10, q8, q9, #1 \n"
                        "vext.32 q11, q8, q9, #2 \n"

                        "vst1.f32 {d14-d15}, [%1]! \n"

                        "subs %0, #1 \n"
                        "bne 0b \n"

                        "sub %2, #16 \n"
                        : "=r"(nn),     // %0
                        "=r"(outptr),   // %1
                        "=r"(r0),       // %2
                        "=r"(r1),       // %3
                        "=r"(r2)        // %4
                        : "0"(nn),
                        "1"(outptr),
                        "2"(r0),
                        "3"(r1),
                        "4"(r2),
                        "w"(_k0123), // %10
                        "w"(_k3456), // %11
                        "w"(_k6789)  // %12
                        : "cc", "memory", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
                }
#endif // __aarch64__
#endif // __ARM_NEON
                for (; remain > 0; remain--)
                {
#if __ARM_NEON
                    float32x4_t _r00 = vld1q_f32(r0);
                    float32x4_t _r10 = vld1q_f32(r1);
                    float32x4_t _r20 = vld1q_f32(r2);

                    float32x4_t _sum = vmulq_f32(_r00, _k0123);
                    _sum = vmlaq_f32(_sum, _r10, _k3456);
                    _sum = vmlaq_f32(_sum, _r20, _k6789);

                    _sum = vsetq_lane_f32(*outptr, _sum, 3);

#if __aarch64__
                    *outptr = vaddvq_f32(_sum);
#else
                    float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
                    _ss = vpadd_f32(_ss, _ss);

                    *outptr = vget_lane_f32(_ss, 0);
#endif // __aarch64__
#else
                    float sum = 0;

                    sum += r0[0] * k0[0];
                    sum += r0[1] * k0[1];
                    sum += r0[2] * k0[2];
                    sum += r1[0] * k1[0];
                    sum += r1[1] * k1[1];
                    sum += r1[2] * k1[2];
                    sum += r2[0] * k2[0];
                    sum += r2[1] * k2[1];
                    sum += r2[2] * k2[2];

                    *outptr += sum;
#endif
                    r0++;
                    r1++;
                    r2++;
                    outptr++;
                }

                r0 += 2;
                r1 += 2;
                r2 += 2;
            }

            kernel0 += 9;
        }
    }
}

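// Winograd F(6x6, 3x3): each 8x8 input tile d and 3x3 kernel g yield a 6x6
// output tile via Y = A^T ((G g G^T) (*) (B^T d B)) A, where (*) is the
// element-wise product. The kernel-side transform U = G g G^T depends only on
// the weights, so it is precomputed once here at load time; ktm below is the
// 8x3 matrix G.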
static void conv3x3s1_winograd64_transform_kernel_neon(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(8 * 8, inch, outch);

    const float ktm[8][3] = {
        {1.0f, 0.0f, 0.0f},
        {-2.0f / 9, -2.0f / 9, -2.0f / 9},
        {-2.0f / 9, 2.0f / 9, -2.0f / 9},
        {1.0f / 90, 1.0f / 45, 2.0f / 45},
        {1.0f / 90, -1.0f / 45, 2.0f / 45},
        {1.0f / 45, 1.0f / 90, 1.0f / 180},
        {1.0f / 45, -1.0f / 90, 1.0f / 180},
        {0.0f, 0.0f, 1.0f}
    };

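    // For each (outch, inch) pair the two passes below compute U = G k G^T of
    // the 3x3 kernel k: the "h" pass forms tmp = G * k^T, the "v" pass applies
    // G again; element [j * 8 + i] of the result is U[i][j], i.e. the 8x8
    // transformed kernel is stored transposed, as the comment below notes.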
    #pragma omp parallel for
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel, transposed
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[8][3];
            for (int i = 0; i < 8; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // v
            for (int j = 0; j < 8; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 8; i++)
                {
                    kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // optimized layout for winograd4
    // interleave weights
    int nn_outch = outch >> 2;
    int remain_outch_start = nn_outch << 2;

    Mat kernel_tm2(8 * 8 * inch * 4, 1, nn_outch + (outch % 4 + 3) / 4);
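    // kernel_tm2 packs 4 output channels per Mat channel (plus one shared
    // channel for the 1-3 leftover output channels). The 64 transformed
    // coefficients are copied 4 floats per pass of the 16-iteration r loops
    // below, interleaved across the 4 output channels, so the GEMM stage can
    // stream weights with contiguous loads.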

    #pragma omp parallel for
    for (int pp = 0; pp < nn_outch; pp++)
    {
        int p = pp * 4;

        float* ktm2 = kernel_tm2.channel(pp);

        const Mat kernel0_tm = kernel_tm.channel(p);
        const Mat kernel1_tm = kernel_tm.channel(p + 1);
        const Mat kernel2_tm = kernel_tm.channel(p + 2);
        const Mat kernel3_tm = kernel_tm.channel(p + 3);

        int q = 0;

#if __ARM_NEON && __aarch64__
        for (; q + 3 < inch; q += 4)
        {
            const float* k00 = kernel0_tm.row(q);
            const float* k01 = kernel0_tm.row(q + 1);
            const float* k02 = kernel0_tm.row(q + 2);
            const float* k03 = kernel0_tm.row(q + 3);
            const float* k10 = kernel1_tm.row(q);
            const float* k11 = kernel1_tm.row(q + 1);
            const float* k12 = kernel1_tm.row(q + 2);
            const float* k13 = kernel1_tm.row(q + 3);
            const float* k20 = kernel2_tm.row(q);
            const float* k21 = kernel2_tm.row(q + 1);
            const float* k22 = kernel2_tm.row(q + 2);
            const float* k23 = kernel2_tm.row(q + 3);
            const float* k30 = kernel3_tm.row(q);
            const float* k31 = kernel3_tm.row(q + 1);
            const float* k32 = kernel3_tm.row(q + 2);
            const float* k33 = kernel3_tm.row(q + 3);

            for (int r = 0; r < 16; r++)
            {
                // split into two asm blocks because gcc rejects asm statements with more than 30 operands :(
                asm volatile(
                    "ld1 {v0.4s}, [%1], #16 \n"
                    "ld1 {v1.4s}, [%2], #16 \n"
                    "ld1 {v2.4s}, [%3], #16 \n"
                    "ld1 {v3.4s}, [%4], #16 \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"

                    "ld1 {v0.4s}, [%5], #16 \n"
                    "ld1 {v1.4s}, [%6], #16 \n"
                    "ld1 {v2.4s}, [%7], #16 \n"
                    "ld1 {v3.4s}, [%8], #16 \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"

                    : "=r"(ktm2), // %0
                    "=r"(k00),    // %1
                    "=r"(k01),    // %2
                    "=r"(k02),    // %3
                    "=r"(k03),    // %4
                    "=r"(k10),    // %5
                    "=r"(k11),    // %6
                    "=r"(k12),    // %7
                    "=r"(k13)     // %8
                    : "0"(ktm2),
                    "1"(k00),
                    "2"(k01),
                    "3"(k02),
                    "4"(k03),
                    "5"(k10),
                    "6"(k11),
                    "7"(k12),
                    "8"(k13)
                    : "cc", "memory", "v0", "v1", "v2", "v3");
                asm volatile(
                    "ld1 {v0.4s}, [%1], #16 \n"
                    "ld1 {v1.4s}, [%2], #16 \n"
                    "ld1 {v2.4s}, [%3], #16 \n"
                    "ld1 {v3.4s}, [%4], #16 \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"

                    "ld1 {v0.4s}, [%5], #16 \n"
                    "ld1 {v1.4s}, [%6], #16 \n"
                    "ld1 {v2.4s}, [%7], #16 \n"
                    "ld1 {v3.4s}, [%8], #16 \n"
                    "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"

                    : "=r"(ktm2), // %0
                    "=r"(k20),    // %1
                    "=r"(k21),    // %2
                    "=r"(k22),    // %3
                    "=r"(k23),    // %4
                    "=r"(k30),    // %5
                    "=r"(k31),    // %6
                    "=r"(k32),    // %7
                    "=r"(k33)     // %8
                    : "0"(ktm2),
                    "1"(k20),
                    "2"(k21),
                    "3"(k22),
                    "4"(k23),
                    "5"(k30),
                    "6"(k31),
                    "7"(k32),
                    "8"(k33)
                    : "cc", "memory", "v0", "v1", "v2", "v3");
            }
        }
#endif // __ARM_NEON && __aarch64__

        for (; q + 1 < inch; q += 2)
        {
            const float* k00 = kernel0_tm.row(q);
            const float* k01 = kernel0_tm.row(q + 1);
            const float* k10 = kernel1_tm.row(q);
            const float* k11 = kernel1_tm.row(q + 1);
            const float* k20 = kernel2_tm.row(q);
            const float* k21 = kernel2_tm.row(q + 1);
            const float* k30 = kernel3_tm.row(q);
            const float* k31 = kernel3_tm.row(q + 1);

            for (int r = 0; r < 16; r++)
            {
#if __ARM_NEON
#if __aarch64__
                asm volatile(
                    "ld1 {v0.4s}, [%1], #16 \n"
                    "ld1 {v1.4s}, [%2], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    "ld1 {v0.4s}, [%3], #16 \n"
                    "ld1 {v1.4s}, [%4], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    "ld1 {v0.4s}, [%5], #16 \n"
                    "ld1 {v1.4s}, [%6], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    "ld1 {v0.4s}, [%7], #16 \n"
                    "ld1 {v1.4s}, [%8], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    : "=r"(ktm2), // %0
                    "=r"(k00),    // %1
                    "=r"(k01),    // %2
                    "=r"(k10),    // %3
                    "=r"(k11),    // %4
                    "=r"(k20),    // %5
                    "=r"(k21),    // %6
                    "=r"(k30),    // %7
                    "=r"(k31)     // %8
                    : "0"(ktm2),
                    "1"(k00),
                    "2"(k01),
                    "3"(k10),
                    "4"(k11),
                    "5"(k20),
                    "6"(k21),
                    "7"(k30),
                    "8"(k31)
                    : "cc", "memory", "v0", "v1");
#else
                asm volatile(
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    "vld1.f32 {d0-d1}, [%3 :128]! \n"
                    "vld1.f32 {d2-d3}, [%4 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    "vld1.f32 {d0-d1}, [%5 :128]! \n"
                    "vld1.f32 {d2-d3}, [%6 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    "vld1.f32 {d0-d1}, [%7 :128]! \n"
                    "vld1.f32 {d2-d3}, [%8 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    : "=r"(ktm2), // %0
                    "=r"(k00),    // %1
                    "=r"(k01),    // %2
                    "=r"(k10),    // %3
                    "=r"(k11),    // %4
                    "=r"(k20),    // %5
                    "=r"(k21),    // %6
                    "=r"(k30),    // %7
                    "=r"(k31)     // %8
                    : "0"(ktm2),
                    "1"(k00),
                    "2"(k01),
                    "3"(k10),
                    "4"(k11),
                    "5"(k20),
                    "6"(k21),
                    "7"(k30),
                    "8"(k31)
                    : "cc", "memory", "q0", "q1");
#endif // __aarch64__
#else
                for (int m = 0; m < 4; m++)
                {
                    ktm2[0 + m] = k00[m];
                    ktm2[4 + m] = k01[m];
                    ktm2[8 + m] = k10[m];
                    ktm2[12 + m] = k11[m];
                    ktm2[16 + m] = k20[m];
                    ktm2[20 + m] = k21[m];
                    ktm2[24 + m] = k30[m];
                    ktm2[28 + m] = k31[m];
                }

                k00 += 4;
                k01 += 4;
                k10 += 4;
                k11 += 4;
                k20 += 4;
                k21 += 4;
                k30 += 4;
                k31 += 4;
                ktm2 += 32;
#endif // __ARM_NEON
            }
        }

        for (; q < inch; q++)
        {
            const float* k00 = kernel0_tm.row(q);
            const float* k10 = kernel1_tm.row(q);
            const float* k20 = kernel2_tm.row(q);
            const float* k30 = kernel3_tm.row(q);

            for (int r = 0; r < 16; r++)
            {
#if __ARM_NEON
#if __aarch64__
                asm volatile(
                    "ld1 {v0.4s}, [%1], #16 \n"
                    "ld1 {v1.4s}, [%2], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    "ld1 {v0.4s}, [%3], #16 \n"
                    "ld1 {v1.4s}, [%4], #16 \n"
                    "st1 {v0.4s, v1.4s}, [%0], #32 \n"

                    : "=r"(ktm2), // %0
                    "=r"(k00),    // %1
                    "=r"(k10),    // %2
                    "=r"(k20),    // %3
                    "=r"(k30)     // %4
                    : "0"(ktm2),
                    "1"(k00),
                    "2"(k10),
                    "3"(k20),
                    "4"(k30)
                    : "cc", "memory", "v0", "v1");
#else
                asm volatile(
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vld1.f32 {d2-d3}, [%2 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    "vld1.f32 {d0-d1}, [%3 :128]! \n"
                    "vld1.f32 {d2-d3}, [%4 :128]! \n"
                    "vst1.f32 {d0-d3}, [%0 :128]! \n"

                    : "=r"(ktm2), // %0
                    "=r"(k00),    // %1
                    "=r"(k10),    // %2
                    "=r"(k20),    // %3
                    "=r"(k30)     // %4
                    : "0"(ktm2),
                    "1"(k00),
                    "2"(k10),
                    "3"(k20),
                    "4"(k30)
                    : "cc", "memory", "q0", "q1");
#endif // __aarch64__
#else
                for (int m = 0; m < 4; m++)
                {
                    ktm2[0 + m] = k00[m];
                    ktm2[4 + m] = k10[m];
                    ktm2[8 + m] = k20[m];
                    ktm2[12 + m] = k30[m];
                }

                k00 += 4;
                k10 += 4;
                k20 += 4;
                k30 += 4;
                ktm2 += 16;
#endif // __ARM_NEON
            }
        }
    }

    #pragma omp parallel for
    for (int p = remain_outch_start; p < outch; p++)
    {
        float* ktm2 = (float*)kernel_tm2.channel(nn_outch) + 8 * 8 * inch * (p - remain_outch_start);

        const Mat kernel0_tm = kernel_tm.channel(p);

        int q = 0;

        for (; q < inch; q++)
        {
            const float* k00 = kernel0_tm.row(q);

            for (int r = 0; r < 16; r++)
            {
#if __ARM_NEON
#if __aarch64__
                asm volatile(
                    "ld1 {v0.4s}, [%1], #16 \n"
                    "st1 {v0.4s}, [%0], #16 \n"
                    : "=r"(ktm2), // %0
                    "=r"(k00)     // %1
                    : "0"(ktm2),
                    "1"(k00)
                    : "cc", "memory", "v0");
#else
                asm volatile(
                    "vld1.f32 {d0-d1}, [%1 :128]! \n"
                    "vst1.f32 {d0-d1}, [%0 :128]! \n"
                    : "=r"(ktm2), // %0
                    "=r"(k00)     // %1
                    : "0"(ktm2),
                    "1"(k00)
                    : "cc", "memory", "q0");
#endif // __aarch64__
#else
                for (int m = 0; m < 4; m++)
                {
                    ktm2[m] = k00[m];
                }

                k00 += 4;
                ktm2 += 4;
#endif // __ARM_NEON
            }
        }
    }

    kernel_tm = kernel_tm2;
}

static void conv3x3s1_winograd64_transform_kernel_neon5(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
{
    kernel_tm.create(8 * 8, inch, outch);

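    // same Winograd F(6x6, 3x3) kernel transform matrix G as in
    // conv3x3s1_winograd64_transform_kernel_neon() above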
    const float ktm[8][3] = {
        {1.0f, 0.0f, 0.0f},
        {-2.0f / 9, -2.0f / 9, -2.0f / 9},
        {-2.0f / 9, 2.0f / 9, -2.0f / 9},
        {1.0f / 90, 1.0f / 45, 2.0f / 45},
        {1.0f / 90, -1.0f / 45, 2.0f / 45},
        {1.0f / 45, 1.0f / 90, 1.0f / 180},
        {1.0f / 45, -1.0f / 90, 1.0f / 180},
        {0.0f, 0.0f, 1.0f}
    };

    #pragma omp parallel for
    for (int p = 0; p < outch; p++)
    {
        for (int q = 0; q < inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel, transposed
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[8][3];
            for (int i = 0; i < 8; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // v
            for (int j = 0; j < 8; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i = 0; i < 8; i++)
                {
                    kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }

    // optimized layout for winograd5
    // interleave weights
    // Mat kernel_tm2(8*8, inch, outch);
    // Mat kernel_tm2(inch, 64, outch);
#if __ARM_NEON && __aarch64__
    Mat kernel_tm2(8 * 4 * (inch / 4) + 8 * (inch % 4), 64, outch / 8 + (outch % 8) / 4 + outch % 4);
#else
    Mat kernel_tm2(4 * 4 * (inch / 4) + 4 * (inch % 4), 64, outch / 4 + outch % 4);
#endif
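    // kernel_tm2 layout for the winograd5 GEMM: on aarch64, blocks of 8
    // output channels come first (each of the 64 rows holds inch groups of 8
    // interleaved weights), then blocks of 4, then single output channels;
    // without aarch64 the blocking is 4 then 1.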

    int p = 0;
#if __aarch64__
    for (; p + 7 < outch; p += 8)
    {
        const Mat kernel0_tm = kernel_tm.channel(p);
        const Mat kernel1_tm = kernel_tm.channel(p + 1);
        const Mat kernel2_tm = kernel_tm.channel(p + 2);
        const Mat kernel3_tm = kernel_tm.channel(p + 3);
        const Mat kernel4_tm = kernel_tm.channel(p + 4);
        const Mat kernel5_tm = kernel_tm.channel(p + 5);
        const Mat kernel6_tm = kernel_tm.channel(p + 6);
        const Mat kernel7_tm = kernel_tm.channel(p + 7);

        Mat ktm2 = kernel_tm2.channel(p / 8);

        for (int r = 0; r < 64; r++)
        {
            float* ktm2p = ktm2.row(r);

            for (int q = 0; q < inch; q++)
            {
                const float* ktm0_0 = kernel0_tm.row(q);
                const float* ktm1_0 = kernel1_tm.row(q);
                const float* ktm2_0 = kernel2_tm.row(q);
                const float* ktm3_0 = kernel3_tm.row(q);
                const float* ktm4_0 = kernel4_tm.row(q);
                const float* ktm5_0 = kernel5_tm.row(q);
                const float* ktm6_0 = kernel6_tm.row(q);
                const float* ktm7_0 = kernel7_tm.row(q);

                ktm2p[0] = ktm0_0[r];
                ktm2p[1] = ktm1_0[r];
                ktm2p[2] = ktm2_0[r];
                ktm2p[3] = ktm3_0[r];
                ktm2p[4] = ktm4_0[r];
                ktm2p[5] = ktm5_0[r];
                ktm2p[6] = ktm6_0[r];
                ktm2p[7] = ktm7_0[r];

                ktm2p += 8;
            }
        }
    }
#endif // __aarch64__
    for (; p + 3 < outch; p += 4)
    {
        const Mat kernel0_tm = kernel_tm.channel(p);
        const Mat kernel1_tm = kernel_tm.channel(p + 1);
        const Mat kernel2_tm = kernel_tm.channel(p + 2);
        const Mat kernel3_tm = kernel_tm.channel(p + 3);

#if __ARM_NEON && __aarch64__
        Mat ktm2 = kernel_tm2.channel(p / 8 + (p % 8) / 4);
#else
        Mat ktm2 = kernel_tm2.channel(p / 4);
#endif

        for (int r = 0; r < 64; r++)
        {
            float* ktm2p = ktm2.row(r);

            for (int q = 0; q < inch; q++)
            {
                const float* ktm0_0 = kernel0_tm.row(q);
                const float* ktm1_0 = kernel1_tm.row(q);
                const float* ktm2_0 = kernel2_tm.row(q);
                const float* ktm3_0 = kernel3_tm.row(q);

                ktm2p[0] = ktm0_0[r];
                ktm2p[1] = ktm1_0[r];
                ktm2p[2] = ktm2_0[r];
                ktm2p[3] = ktm3_0[r];

                ktm2p += 4;
            }
        }
    }
    for (; p < outch; p++)
    {
        const Mat kernel0_tm = kernel_tm.channel(p);

#if __ARM_NEON && __aarch64__
        Mat ktm2 = kernel_tm2.channel(p / 8 + (p % 8) / 4 + p % 4);
#else
        Mat ktm2 = kernel_tm2.channel(p / 4 + p % 4);
#endif

        for (int r = 0; r < 64; r++)
        {
            float* ktm2p = ktm2.row(r);

            for (int q = 0; q < inch; q++)
            {
                const float* ktm0_0 = kernel0_tm.row(q);

                ktm2p[0] = ktm0_0[r];

                ktm2p += 1;
            }
        }
    }

    kernel_tm = kernel_tm2;
}

static void conv3x3s1_winograd64_neon4(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 6n+2: round the output size up to a multiple of 6 and add a
    // 2-pixel border, so the input splits into 8x8 tiles overlapping by 2
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 5) / 6 * 6;
    outh = (outh + 5) / 6 * 6;

    w = outw + 2;
    h = outh + 2;
    Option opt_b = opt;
    opt_b.blob_allocator = opt.workspace_allocator;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);

    const float* bias = _bias;

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tm = outw / 6 * 8;
        int h_tm = outh / 6 * 8;
        bottom_blob_tm.create(4, 16 * w_tm / 8 * h_tm / 8, inch, 4u, opt.workspace_allocator);
        const int tiles = w_tm / 8 * h_tm / 8;
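
        // the padded input is cut into overlapping 8x8 tiles with step 6;
        // tiles = w_tm/8 * h_tm/8. Each tile's 64 transformed values are
        // stored as 16 rows of 4 floats in bottom_blob_tm, and the rows
        // belonging to one tile are spaced `tiles` rows apart.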

        // const float itm[8][8] = {
        //     {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
        //
        //     {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
        //     {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
        //
        //     {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
        //     {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
        //
        //     {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
        //     {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
        //
        //     {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
        // };

        // 0 = r00 - r06 + (r04 - r02) * 5.25
        // 7 = r07 - r01 + (r03 - r05) * 5.25

        // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
        // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)

        // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
        // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)

        // reuse r04 * 1.25
        // reuse r03 * 2.5
        // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
        // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
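
        // the transform is applied in two passes: first along rows into the
        // tmp[8][8] buffer, then along the columns of tmp, four at a time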

#if __ARM_NEON
        const float coeff[8] = {
            0.25f, 0.5f, -1.25f, 2.f,
            -2.5f, 4.f, 4.25f, 5.25f
        };
        float32x4_t _coeff0 = vld1q_f32(coeff);
        float32x4_t _coeff1 = vld1q_f32(coeff + 4);
#endif // __ARM_NEON

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q = 0; q < inch; q++)
        {
            const Mat img0 = bottom_blob_bordered.channel(q);
            Mat img0_tm = bottom_blob_tm.channel(q);

            float tmp[8][8];

            // tile
            for (int i = 0; i < h_tm / 8; i++)
            {
                for (int j = 0; j < w_tm / 8; j++)
                {
#if __ARM_NEON
                    const float* r0 = img0.row(i * 6) + j * 6;
                    const float* r1 = r0 + w;
                    const float* r2 = r0 + w * 2;
                    const float* r3 = r0 + w * 3;

                    // the assembly block for the armv7 input transform requires 13 general registers
                    // old gcc may fail to allocate registers on a debug build without -fomit-frame-pointer
                    // so, fall back to the intrinsic version for armv7 debug builds --- nihui
#if __aarch64__ || !defined(NDEBUG)
                    for (int m = 0; m + 3 < 8; m += 4)
                    {
                        float32x4_t _r0_0123 = vld1q_f32(r0);
                        float32x4_t _r0_4567 = vld1q_f32(r0 + 4);
                        float32x4_t _r1_0123 = vld1q_f32(r1);
                        float32x4_t _r1_4567 = vld1q_f32(r1 + 4);
                        float32x4_t _r2_0123 = vld1q_f32(r2);
                        float32x4_t _r2_4567 = vld1q_f32(r2 + 4);
                        float32x4_t _r3_0123 = vld1q_f32(r3);
                        float32x4_t _r3_4567 = vld1q_f32(r3 + 4);

                        float32x4x2_t _r01_00221133 = vtrnq_f32(_r0_0123, _r1_0123);
                        float32x4x2_t _r01_44665577 = vtrnq_f32(_r0_4567, _r1_4567);
                        float32x4x2_t _r23_00221133 = vtrnq_f32(_r2_0123, _r3_0123);
                        float32x4x2_t _r23_44665577 = vtrnq_f32(_r2_4567, _r3_4567);

                        // no vswp intrinsic :(
                        float32x4_t _r_00 = vcombine_f32(vget_low_f32(_r01_00221133.val[0]), vget_low_f32(_r23_00221133.val[0]));
                        float32x4_t _r_11 = vcombine_f32(vget_low_f32(_r01_00221133.val[1]), vget_low_f32(_r23_00221133.val[1]));
                        float32x4_t _r_22 = vcombine_f32(vget_high_f32(_r01_00221133.val[0]), vget_high_f32(_r23_00221133.val[0]));
                        float32x4_t _r_33 = vcombine_f32(vget_high_f32(_r01_00221133.val[1]), vget_high_f32(_r23_00221133.val[1]));
                        float32x4_t _r_44 = vcombine_f32(vget_low_f32(_r01_44665577.val[0]), vget_low_f32(_r23_44665577.val[0]));
                        float32x4_t _r_55 = vcombine_f32(vget_low_f32(_r01_44665577.val[1]), vget_low_f32(_r23_44665577.val[1]));
                        float32x4_t _r_66 = vcombine_f32(vget_high_f32(_r01_44665577.val[0]), vget_high_f32(_r23_44665577.val[0]));
                        float32x4_t _r_77 = vcombine_f32(vget_high_f32(_r01_44665577.val[1]), vget_high_f32(_r23_44665577.val[1]));

                        float32x4_t _r_0_m_6 = vsubq_f32(_r_00, _r_66);
                        float32x4_t _r_7_m_1 = vsubq_f32(_r_77, _r_11);

                        float32x4_t _r_4_m_2 = vsubq_f32(_r_44, _r_22);
                        float32x4_t _r_3_m_5 = vsubq_f32(_r_33, _r_55);

                        float32x4_t _tmp0 = vmlaq_lane_f32(_r_0_m_6, _r_4_m_2, vget_high_f32(_coeff1), 1);
                        float32x4_t _tmp7 = vmlaq_lane_f32(_r_7_m_1, _r_3_m_5, vget_high_f32(_coeff1), 1);

                        vst1q_f32(&tmp[0][m], _tmp0);
                        vst1q_f32(&tmp[7][m], _tmp7);

                        float32x4_t _r_2_a_6 = vaddq_f32(_r_22, _r_66);
                        float32x4_t _r_1_a_5 = vaddq_f32(_r_11, _r_55);

                        float32x4_t _tmp12a = vmlsq_lane_f32(_r_2_a_6, _r_44, vget_high_f32(_coeff1), 0);
                        float32x4_t _tmp12b = vmlsq_lane_f32(_r_1_a_5, _r_33, vget_high_f32(_coeff1), 0);

                        float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
                        float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);

                        vst1q_f32(&tmp[1][m], _tmp1);
                        vst1q_f32(&tmp[2][m], _tmp2);

                        float32x4_t _r_4_x_c = vmulq_lane_f32(_r_44, vget_high_f32(_coeff0), 0);
                        float32x4_t _r_3_x_c = vmulq_lane_f32(_r_33, vget_low_f32(_coeff1), 0);

                        float32x4_t _tmp34a = vaddq_f32(_r_66, _r_4_x_c);
                        _tmp34a = vmlaq_lane_f32(_tmp34a, _r_22, vget_low_f32(_coeff0), 0);

                        float32x4_t _tmp34b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_low_f32(_coeff0), 1);
                        _tmp34b = vmlaq_lane_f32(_tmp34b, _r_55, vget_high_f32(_coeff0), 1);

                        float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
                        float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);

                        vst1q_f32(&tmp[3][m], _tmp3);
                        vst1q_f32(&tmp[4][m], _tmp4);

                        // reuse r04 * 1.25
                        // reuse r03 * 2.5
                        float32x4_t _r_2_a_4c = vaddq_f32(_r_22, _r_4_x_c);
                        float32x4_t _tmp56a = vmlaq_lane_f32(_r_66, _r_2_a_4c, vget_low_f32(_coeff1), 1);
                        float32x4_t _tmp56b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_high_f32(_coeff0), 1);
                        _tmp56b = vmlaq_lane_f32(_tmp56b, _r_55, vget_low_f32(_coeff0), 1);

                        float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
                        float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);

                        vst1q_f32(&tmp[5][m], _tmp5);
                        vst1q_f32(&tmp[6][m], _tmp6);

                        r0 += w * 4;
                        r1 += w * 4;
                        r2 += w * 4;
                        r3 += w * 4;
                    }

                    const float* t0 = tmp[0];
                    const float* t1 = tmp[1];
                    const float* t2 = tmp[2];
                    const float* t3 = tmp[3];

                    float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
                    float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
                    float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
                    float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
                    float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
                    float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
                    float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
                    float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
2056
2057 for (int m = 0; m + 3 < 8; m += 4)
2058 {
2059 float32x4_t _t0_0123 = vld1q_f32(t0);
2060 float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
2061 float32x4_t _t1_0123 = vld1q_f32(t1);
2062 float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
2063 float32x4_t _t2_0123 = vld1q_f32(t2);
2064 float32x4_t _t2_4567 = vld1q_f32(t2 + 4);
2065 float32x4_t _t3_0123 = vld1q_f32(t3);
2066 float32x4_t _t3_4567 = vld1q_f32(t3 + 4);
2067
2068 float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
2069 float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
2070 float32x4x2_t _t23_00221133 = vtrnq_f32(_t2_0123, _t3_0123);
2071 float32x4x2_t _t23_44665577 = vtrnq_f32(_t2_4567, _t3_4567);
2072
2073 // no vswp intrinsic :(
2074 float32x4_t _t_00 = vcombine_f32(vget_low_f32(_t01_00221133.val[0]), vget_low_f32(_t23_00221133.val[0]));
2075 float32x4_t _t_11 = vcombine_f32(vget_low_f32(_t01_00221133.val[1]), vget_low_f32(_t23_00221133.val[1]));
2076 float32x4_t _t_22 = vcombine_f32(vget_high_f32(_t01_00221133.val[0]), vget_high_f32(_t23_00221133.val[0]));
2077 float32x4_t _t_33 = vcombine_f32(vget_high_f32(_t01_00221133.val[1]), vget_high_f32(_t23_00221133.val[1]));
2078 float32x4_t _t_44 = vcombine_f32(vget_low_f32(_t01_44665577.val[0]), vget_low_f32(_t23_44665577.val[0]));
2079 float32x4_t _t_55 = vcombine_f32(vget_low_f32(_t01_44665577.val[1]), vget_low_f32(_t23_44665577.val[1]));
2080 float32x4_t _t_66 = vcombine_f32(vget_high_f32(_t01_44665577.val[0]), vget_high_f32(_t23_44665577.val[0]));
2081 float32x4_t _t_77 = vcombine_f32(vget_high_f32(_t01_44665577.val[1]), vget_high_f32(_t23_44665577.val[1]));
2082
2083 float32x4_t _t_0_m_6 = vsubq_f32(_t_00, _t_66);
2084 float32x4_t _t_7_m_1 = vsubq_f32(_t_77, _t_11);
2085
2086 float32x4_t _t_4_m_2 = vsubq_f32(_t_44, _t_22);
2087 float32x4_t _t_3_m_5 = vsubq_f32(_t_33, _t_55);
2088
2089 float32x4_t _r0_tm_0_0 = vmlaq_lane_f32(_t_0_m_6, _t_4_m_2, vget_high_f32(_coeff1), 1);
2090 float32x4_t _r0_tm_4_3 = vmlaq_lane_f32(_t_7_m_1, _t_3_m_5, vget_high_f32(_coeff1), 1);
2091
2092 r0_tm0_0[0] = vgetq_lane_f32(_r0_tm_0_0, 0);
2093 r0_tm1_0[0] = vgetq_lane_f32(_r0_tm_0_0, 1);
2094 r0_tm2_0[0] = vgetq_lane_f32(_r0_tm_0_0, 2);
2095 r0_tm3_0[0] = vgetq_lane_f32(_r0_tm_0_0, 3);
2096
2097 r0_tm0_4[3] = vgetq_lane_f32(_r0_tm_4_3, 0);
2098 r0_tm1_4[3] = vgetq_lane_f32(_r0_tm_4_3, 1);
2099 r0_tm2_4[3] = vgetq_lane_f32(_r0_tm_4_3, 2);
2100 r0_tm3_4[3] = vgetq_lane_f32(_r0_tm_4_3, 3);
2101
2102 float32x4_t _t_2_m_6 = vaddq_f32(_t_22, _t_66);
2103 float32x4_t _t_1_m_5 = vaddq_f32(_t_11, _t_55);
2104
2105 float32x4_t _tmp12a = vmlsq_lane_f32(_t_2_m_6, _t_44, vget_high_f32(_coeff1), 0);
2106 float32x4_t _tmp12b = vmlsq_lane_f32(_t_1_m_5, _t_33, vget_high_f32(_coeff1), 0);
2107
2108 float32x4_t _r0_tm_0_1 = vaddq_f32(_tmp12a, _tmp12b);
2109 float32x4_t _r0_tm_0_2 = vsubq_f32(_tmp12a, _tmp12b);
2110
2111 r0_tm0_0[1] = vgetq_lane_f32(_r0_tm_0_1, 0);
2112 r0_tm1_0[1] = vgetq_lane_f32(_r0_tm_0_1, 1);
2113 r0_tm2_0[1] = vgetq_lane_f32(_r0_tm_0_1, 2);
2114 r0_tm3_0[1] = vgetq_lane_f32(_r0_tm_0_1, 3);
2115
2116 r0_tm0_0[2] = vgetq_lane_f32(_r0_tm_0_2, 0);
2117 r0_tm1_0[2] = vgetq_lane_f32(_r0_tm_0_2, 1);
2118 r0_tm2_0[2] = vgetq_lane_f32(_r0_tm_0_2, 2);
2119 r0_tm3_0[2] = vgetq_lane_f32(_r0_tm_0_2, 3);
2120
2121 float32x4_t _t_4_x_c = vmulq_lane_f32(_t_44, vget_high_f32(_coeff0), 0);
2122 float32x4_t _t_3_x_c = vmulq_lane_f32(_t_33, vget_low_f32(_coeff1), 0);
2123
2124 float32x4_t _tmp34a = vaddq_f32(_t_66, _t_4_x_c);
2125 _tmp34a = vmlaq_lane_f32(_tmp34a, _t_22, vget_low_f32(_coeff0), 0);
2126
2127 float32x4_t _tmp34b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_low_f32(_coeff0), 1);
2128 _tmp34b = vmlaq_lane_f32(_tmp34b, _t_55, vget_high_f32(_coeff0), 1);
2129
2130 float32x4_t _r0_tm_0_3 = vaddq_f32(_tmp34a, _tmp34b);
2131 float32x4_t _r0_tm_4_0 = vsubq_f32(_tmp34a, _tmp34b);
2132
2133 r0_tm0_0[3] = vgetq_lane_f32(_r0_tm_0_3, 0);
2134 r0_tm1_0[3] = vgetq_lane_f32(_r0_tm_0_3, 1);
2135 r0_tm2_0[3] = vgetq_lane_f32(_r0_tm_0_3, 2);
2136 r0_tm3_0[3] = vgetq_lane_f32(_r0_tm_0_3, 3);
2137
2138 r0_tm0_4[0] = vgetq_lane_f32(_r0_tm_4_0, 0);
2139 r0_tm1_4[0] = vgetq_lane_f32(_r0_tm_4_0, 1);
2140 r0_tm2_4[0] = vgetq_lane_f32(_r0_tm_4_0, 2);
2141 r0_tm3_4[0] = vgetq_lane_f32(_r0_tm_4_0, 3);
2142
2143 float32x4_t _t_2_a_4c = vaddq_f32(_t_22, _t_4_x_c);
2144 float32x4_t _tmp56a = vmlaq_lane_f32(_t_66, _t_2_a_4c, vget_low_f32(_coeff1), 1);
2145 float32x4_t _tmp56b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_high_f32(_coeff0), 1);
2146 _tmp56b = vmlaq_lane_f32(_tmp56b, _t_55, vget_low_f32(_coeff0), 1);
2147
2148 float32x4_t _r0_tm_4_1 = vaddq_f32(_tmp56a, _tmp56b);
2149 float32x4_t _r0_tm_4_2 = vsubq_f32(_tmp56a, _tmp56b);
2150
2151 r0_tm0_4[1] = vgetq_lane_f32(_r0_tm_4_1, 0);
2152 r0_tm1_4[1] = vgetq_lane_f32(_r0_tm_4_1, 1);
2153 r0_tm2_4[1] = vgetq_lane_f32(_r0_tm_4_1, 2);
2154 r0_tm3_4[1] = vgetq_lane_f32(_r0_tm_4_1, 3);
2155
2156 r0_tm0_4[2] = vgetq_lane_f32(_r0_tm_4_2, 0);
2157 r0_tm1_4[2] = vgetq_lane_f32(_r0_tm_4_2, 1);
2158 r0_tm2_4[2] = vgetq_lane_f32(_r0_tm_4_2, 2);
2159 r0_tm3_4[2] = vgetq_lane_f32(_r0_tm_4_2, 3);
2160
2161 t0 += 8 * 4;
2162 t1 += 8 * 4;
2163 t2 += 8 * 4;
2164 t3 += 8 * 4;
2165
2166 r0_tm0_0 += img0_tm.w * tiles * 2 * 4;
2167 r0_tm0_4 += img0_tm.w * tiles * 2 * 4;
2168 r0_tm1_0 += img0_tm.w * tiles * 2 * 4;
2169 r0_tm1_4 += img0_tm.w * tiles * 2 * 4;
2170 r0_tm2_0 += img0_tm.w * tiles * 2 * 4;
2171 r0_tm2_4 += img0_tm.w * tiles * 2 * 4;
2172 r0_tm3_0 += img0_tm.w * tiles * 2 * 4;
2173 r0_tm3_4 += img0_tm.w * tiles * 2 * 4;
2174 }
2175 #else // __aarch64__
2176 float* t0 = tmp[0];
2177 float* t1 = tmp[1];
2178 float* t2 = tmp[2];
2179 float* t3 = tmp[3];
2180 float* t4 = tmp[4];
2181 float* t5 = tmp[5];
2182 float* t6 = tmp[6];
2183 float* t7 = tmp[7];
2184
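// stepw advances the four source-row pointers by four image rows (in bytes),
// so loop0 of the asm below sees rows 0-3 and loop1 sees rows 4-7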
2185 int stepw = w * 4 * 4;
2186
2187 asm volatile(
2188
2189 // loop0
2190 "vld1.f32 {d16-d19}, [%8], %26 \n"
2191 "vld1.f32 {d20-d23}, [%9], %26 \n"
2192 "vld1.f32 {d24-d27}, [%10], %26 \n"
2193
2194 "vtrn.32 q8, q10 \n"
2195
2196 "vld1.f32 {d28-d31}, [%11], %26 \n"
2197
2198 "vtrn.32 q9, q11 \n"
2199 "vtrn.32 q12, q14 \n"
2200 "vtrn.32 q13, q15 \n"
2201
2202 "vswp d17, d24 \n"
2203 "vswp d19, d26 \n"
2204 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
2205 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
2206
2207 "vsub.f32 q2, q8, q13 \n"
2208 "vsub.f32 q3, q9, q12 \n"
2209
2210 "vadd.f32 q4, q12, q13 \n"
2211 "vadd.f32 q5, q10, q11 \n"
2212
2213 "vmla.f32 q2, q3, %f25[1] \n"
2214
2215 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
2216 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
2217
2218 "vmls.f32 q4, q9, %f25[0] \n"
2219 "vmls.f32 q5, q14, %f25[0] \n"
2220
2221 "vst1.f32 {d4-d5}, [%0]! \n" // tmp[0][m]
2222
2223 "vmov q3, q7 \n" // use q7
2224
2225 "vadd.f32 q2, q13, q6 \n" // use q6
2226 "vmla.f32 q3, q10, %e24[1] \n"
2227
2228 "vadd.f32 q8, q4, q5 \n"
2229 "vsub.f32 q9, q4, q5 \n"
2230
2231 "vmov q5, q7 \n" // use q7
2232
2233 "vadd.f32 q6, q12, q6 \n" // use q6
2234 "vmla.f32 q5, q10, %f24[1] \n"
2235
2236 "vmov q4, q13 \n"
2237
2238 "vmla.f32 q2, q12, %e24[0] \n"
2239 "vmla.f32 q3, q11, %f24[1] \n"
2240
2241 "vst1.f32 {d16-d17}, [%1]! \n" // tmp[1][m]
2242
2243 "vmla.f32 q4, q6, %e25[1] \n"
2244 "vmla.f32 q5, q11, %e24[1] \n"
2245
2246 "vst1.f32 {d18-d19}, [%2]! \n" // tmp[2][m]
2247
2248 "vadd.f32 q8, q2, q3 \n"
2249 "vsub.f32 q9, q2, q3 \n"
2250
2251 "vsub.f32 q6, q15, q10 \n"
2252 "vsub.f32 q7, q14, q11 \n"
2253
2254 "vadd.f32 q2, q4, q5 \n"
2255 "vsub.f32 q3, q4, q5 \n"
2256
2257 "vst1.f32 {d16-d17}, [%3]! \n" // tmp[3][m]
2258 "vst1.f32 {d18-d19}, [%4]! \n" // tmp[4][m]
2259
2260 "vmla.f32 q6, q7, %f25[1] \n"
2261
2262 "vst1.f32 {d4-d5}, [%5]! \n" // tmp[5][m]
2263 "vst1.f32 {d6-d7}, [%6]! \n" // tmp[6][m]
2264
2265 "vst1.f32 {d12-d13}, [%7]! \n" // tmp[7][m]
2266
2267 // loop1
2268 "vld1.f32 {d16-d19}, [%8] \n"
2269 "vld1.f32 {d20-d23}, [%9] \n"
2270 "vld1.f32 {d24-d27}, [%10] \n"
2271
2272 "vtrn.32 q8, q10 \n"
2273
2274 "vld1.f32 {d28-d31}, [%11] \n"
2275
2276 "vtrn.32 q9, q11 \n"
2277 "vtrn.32 q12, q14 \n"
2278 "vtrn.32 q13, q15 \n"
2279
2280 "vswp d17, d24 \n"
2281 "vswp d19, d26 \n"
2282 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
2283 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
2284
2285 "vsub.f32 q2, q8, q13 \n"
2286 "vsub.f32 q3, q9, q12 \n"
2287
2288 "vadd.f32 q4, q12, q13 \n"
2289 "vadd.f32 q5, q10, q11 \n"
2290
2291 "vmla.f32 q2, q3, %f25[1] \n"
2292
2293 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
2294 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
2295
2296 "vmls.f32 q4, q9, %f25[0] \n"
2297 "vmls.f32 q5, q14, %f25[0] \n"
2298
2299 "vst1.f32 {d4-d5}, [%0]! \n" // tmp[0][m]
2300
2301 "vmov q3, q7 \n" // use q7
2302
2303 "vadd.f32 q2, q13, q6 \n" // use q6
2304 "vmla.f32 q3, q10, %e24[1] \n"
2305
2306 "vadd.f32 q8, q4, q5 \n"
2307 "vsub.f32 q9, q4, q5 \n"
2308
2309 "vmov q5, q7 \n" // use q7
2310
2311 "vadd.f32 q6, q12, q6 \n" // use q6
2312 "vmla.f32 q5, q10, %f24[1] \n"
2313
2314 "vmov q4, q13 \n"
2315
2316 "vmla.f32 q2, q12, %e24[0] \n"
2317 "vmla.f32 q3, q11, %f24[1] \n"
2318
2319 "vst1.f32 {d16-d17}, [%1]! \n" // tmp[1][m]
2320
2321 "vmla.f32 q4, q6, %e25[1] \n"
2322 "vmla.f32 q5, q11, %e24[1] \n"
2323
2324 "vst1.f32 {d18-d19}, [%2]! \n" // tmp[2][m]
2325
2326 "vadd.f32 q8, q2, q3 \n"
2327 "vsub.f32 q9, q2, q3 \n"
2328
2329 "vsub.f32 q6, q15, q10 \n"
2330 "vsub.f32 q7, q14, q11 \n"
2331
2332 "vadd.f32 q2, q4, q5 \n"
2333 "vsub.f32 q3, q4, q5 \n"
2334
2335 "vst1.f32 {d16-d17}, [%3]! \n" // tmp[3][m]
2336 "vst1.f32 {d18-d19}, [%4]! \n" // tmp[4][m]
2337
2338 "vmla.f32 q6, q7, %f25[1] \n"
2339
2340 "vst1.f32 {d4-d5}, [%5]! \n" // tmp[5][m]
2341 "vst1.f32 {d6-d7}, [%6]! \n" // tmp[6][m]
2342
2343 "vst1.f32 {d12-d13}, [%7]! \n" // tmp[7][m]
2344
2345 : "=r"(t0), // %0
2346 "=r"(t1), // %1
2347 "=r"(t2), // %2
2348 "=r"(t3), // %3
2349 "=r"(t4), // %4
2350 "=r"(t5), // %5
2351 "=r"(t6), // %6
2352 "=r"(t7), // %7
2353 "=r"(r0), // %8
2354 "=r"(r1), // %9
2355 "=r"(r2), // %10
2356 "=r"(r3) // %11
2357 : "0"(t0),
2358 "1"(t1),
2359 "2"(t2),
2360 "3"(t3),
2361 "4"(t4),
2362 "5"(t5),
2363 "6"(t6),
2364 "7"(t7),
2365 "8"(r0),
2366 "9"(r1),
2367 "10"(r2),
2368 "11"(r3),
2369 "w"(_coeff0), // %24
2370 "w"(_coeff1), // %25
2371 "r"(stepw) // %26
2372 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2373
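// the asm above is the armv7 twin of the aarch64 first pass: transpose four
// source rows, transform them horizontally, and store into tmp[0..7][m];
// loop0 filled tmp[k][0..3] from rows 0-3, loop1 tmp[k][4..7] from rows 4-7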
2374 t0 = tmp[0];
2375 t1 = tmp[1];
2376 t2 = tmp[2];
2377 t3 = tmp[3];
2378
2379 float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
2380 float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
2381 float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
2382 float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
2383 float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
2384 float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
2385 float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
2386 float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
2387
2388 int step = img0_tm.w * tiles * 2 * 4 * 4;
2389
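// second pass in asm: loop0 transforms tmp rows 0-3 (horizontal elements 0-3),
// loop1 rows 4-7, storing straight into the scattered img0_tm rows,
// using `step` to hop between them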
2390 asm volatile(
2391
2392 // loop0
2393 "vld1.f32 {d16-d19}, [%8] \n"
2394 "add %8, %8, #128 \n"
2395 "vld1.f32 {d20-d23}, [%9] \n"
2396 "add %9, %9, #128 \n"
2397 "vld1.f32 {d24-d27}, [%10] \n"
2398 "add %10, %10, #128 \n"
2399
2400 "vtrn.32 q8, q10 \n"
2401
2402 "vld1.f32 {d28-d31}, [%11] \n"
2403 "add %11, %11, #128 \n"
2404
2405 "vtrn.32 q9, q11 \n"
2406 "vtrn.32 q12, q14 \n"
2407 "vtrn.32 q13, q15 \n"
2408
2409 "vswp d17, d24 \n"
2410 "vswp d19, d26 \n"
2411 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
2412 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
2413
2414 "vsub.f32 q2, q8, q13 \n"
2415 "vsub.f32 q3, q9, q12 \n"
2416
2417 "vadd.f32 q4, q12, q13 \n"
2418 "vadd.f32 q5, q10, q11 \n"
2419
2420 "vmla.f32 q2, q3, %f25[1] \n"
2421
2422 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
2423 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
2424
2425 "vmls.f32 q4, q9, %f25[0] \n"
2426 "vmls.f32 q5, q14, %f25[0] \n"
2427
2428 "vst1.f32 {d4[0]}, [%0]! \n"
2429 "vst1.f32 {d4[1]}, [%2]! \n"
2430
2431 "vmov q3, q7 \n" // use q7
2432
2433 "vst1.f32 {d5[0]}, [%4]! \n"
2434 "vst1.f32 {d5[1]}, [%6]! \n"
2435
2436 "vadd.f32 q2, q13, q6 \n" // use q6
2437 "vmla.f32 q3, q10, %e24[1] \n"
2438
2439 "vadd.f32 q8, q4, q5 \n"
2440 "vsub.f32 q9, q4, q5 \n"
2441
2442 "vmov q5, q7 \n" // use q7
2443
2444 "vadd.f32 q6, q12, q6 \n" // use q6
2445 "vmla.f32 q5, q10, %f24[1] \n"
2446
2447 "vmov q4, q13 \n"
2448
2449 "vmla.f32 q2, q12, %e24[0] \n"
2450 "vmla.f32 q3, q11, %f24[1] \n"
2451
2452 "vst1.f32 {d16[0]}, [%0]! \n"
2453 "vst1.f32 {d16[1]}, [%2]! \n"
2454
2455 "vmla.f32 q4, q6, %e25[1] \n"
2456
2457 "vst1.f32 {d17[0]}, [%4]! \n"
2458 "vst1.f32 {d17[1]}, [%6]! \n"
2459
2460 "vmla.f32 q5, q11, %e24[1] \n"
2461
2462 "vst1.f32 {d18[0]}, [%0]! \n"
2463 "vst1.f32 {d18[1]}, [%2]! \n"
2464
2465 "vadd.f32 q8, q2, q3 \n"
2466
2467 "vst1.f32 {d19[0]}, [%4]! \n"
2468 "vst1.f32 {d19[1]}, [%6]! \n"
2469
2470 "vsub.f32 q9, q2, q3 \n"
2471
2472 "vsub.f32 q6, q15, q10 \n"
2473 "vsub.f32 q7, q14, q11 \n"
2474
2475 "vadd.f32 q2, q4, q5 \n"
2476 "vsub.f32 q3, q4, q5 \n"
2477
2478 "vst1.f32 {d16[0]}, [%0], %26 \n"
2479 "vst1.f32 {d16[1]}, [%2], %26 \n"
2480
2481 "vmla.f32 q6, q7, %f25[1] \n"
2482
2483 "vst1.f32 {d17[0]}, [%4], %26 \n"
2484 "vst1.f32 {d17[1]}, [%6], %26 \n"
2485
2486 "vtrn.32 q9, q2 \n"
2487 "vtrn.32 q3, q6 \n"
2488
2489 "sub %0, %0, #12 \n"
2490 "sub %2, %2, #12 \n"
2491 "sub %4, %4, #12 \n"
2492 "sub %6, %6, #12 \n"
2493
2494 "vswp d19, d6 \n"
2495 "vswp d5, d12 \n"
2496
2497 "vst1.f32 {d18-d19}, [%1], %26 \n"
2498 "vst1.f32 {d4-d5}, [%3], %26 \n"
2499 "vst1.f32 {d6-d7}, [%5], %26 \n"
2500 "vst1.f32 {d12-d13}, [%7], %26 \n"
2501
2502 // loop1
2503 "vld1.f32 {d16-d19}, [%8] \n"
2504 "vld1.f32 {d20-d23}, [%9] \n"
2505 "vld1.f32 {d24-d27}, [%10] \n"
2506
2507 "vtrn.32 q8, q10 \n"
2508
2509 "vld1.f32 {d28-d31}, [%11] \n"
2510
2511 "vtrn.32 q9, q11 \n"
2512 "vtrn.32 q12, q14 \n"
2513 "vtrn.32 q13, q15 \n"
2514
2515 "vswp d17, d24 \n"
2516 "vswp d19, d26 \n"
2517 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
2518 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
2519
2520 "vsub.f32 q2, q8, q13 \n"
2521 "vsub.f32 q3, q9, q12 \n"
2522
2523 "vadd.f32 q4, q12, q13 \n"
2524 "vadd.f32 q5, q10, q11 \n"
2525
2526 "vmla.f32 q2, q3, %f25[1] \n"
2527
2528 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
2529 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
2530
2531 "vmls.f32 q4, q9, %f25[0] \n"
2532 "vmls.f32 q5, q14, %f25[0] \n"
2533
2534 "vst1.f32 {d4[0]}, [%0]! \n"
2535 "vst1.f32 {d4[1]}, [%2]! \n"
2536
2537 "vmov q3, q7 \n" // use q7
2538
2539 "vst1.f32 {d5[0]}, [%4]! \n"
2540 "vst1.f32 {d5[1]}, [%6]! \n"
2541
2542 "vadd.f32 q2, q13, q6 \n" // use q6
2543 "vmla.f32 q3, q10, %e24[1] \n"
2544
2545 "vadd.f32 q8, q4, q5 \n"
2546 "vsub.f32 q9, q4, q5 \n"
2547
2548 "vmov q5, q7 \n" // use q7
2549
2550 "vadd.f32 q6, q12, q6 \n" // use q6
2551 "vmla.f32 q5, q10, %f24[1] \n"
2552
2553 "vmov q4, q13 \n"
2554
2555 "vmla.f32 q2, q12, %e24[0] \n"
2556 "vmla.f32 q3, q11, %f24[1] \n"
2557
2558 "vst1.f32 {d16[0]}, [%0]! \n"
2559 "vst1.f32 {d16[1]}, [%2]! \n"
2560
2561 "vmla.f32 q4, q6, %e25[1] \n"
2562
2563 "vst1.f32 {d17[0]}, [%4]! \n"
2564 "vst1.f32 {d17[1]}, [%6]! \n"
2565
2566 "vmla.f32 q5, q11, %e24[1] \n"
2567
2568 "vst1.f32 {d18[0]}, [%0]! \n"
2569 "vst1.f32 {d18[1]}, [%2]! \n"
2570
2571 "vadd.f32 q8, q2, q3 \n"
2572
2573 "vst1.f32 {d19[0]}, [%4]! \n"
2574 "vst1.f32 {d19[1]}, [%6]! \n"
2575
2576 "vsub.f32 q9, q2, q3 \n"
2577
2578 "vsub.f32 q6, q15, q10 \n"
2579 "vsub.f32 q7, q14, q11 \n"
2580
2581 "vadd.f32 q2, q4, q5 \n"
2582 "vsub.f32 q3, q4, q5 \n"
2583
2584 "vst1.f32 {d16[0]}, [%0] \n"
2585 "vst1.f32 {d16[1]}, [%2] \n"
2586
2587 "vmla.f32 q6, q7, %f25[1] \n"
2588
2589 "vst1.f32 {d17[0]}, [%4] \n"
2590 "vst1.f32 {d17[1]}, [%6] \n"
2591
2592 "vtrn.32 q9, q2 \n"
2593 "vtrn.32 q3, q6 \n"
2594
2595 "vswp d19, d6 \n"
2596 "vswp d5, d12 \n"
2597
2598 "vst1.f32 {d18-d19}, [%1] \n"
2599 "vst1.f32 {d4-d5}, [%3] \n"
2600 "vst1.f32 {d6-d7}, [%5] \n"
2601 "vst1.f32 {d12-d13}, [%7] \n"
2602
2603 : "=r"(r0_tm0_0), // %0
2604 "=r"(r0_tm0_4), // %1
2605 "=r"(r0_tm1_0), // %2
2606 "=r"(r0_tm1_4), // %3
2607 "=r"(r0_tm2_0), // %4
2608 "=r"(r0_tm2_4), // %5
2609 "=r"(r0_tm3_0), // %6
2610 "=r"(r0_tm3_4), // %7
2611 "=r"(t0), // %8
2612 "=r"(t1), // %9
2613 "=r"(t2), // %10
2614 "=r"(t3) // %11
2615 : "0"(r0_tm0_0),
2616 "1"(r0_tm0_4),
2617 "2"(r0_tm1_0),
2618 "3"(r0_tm1_4),
2619 "4"(r0_tm2_0),
2620 "5"(r0_tm2_4),
2621 "6"(r0_tm3_0),
2622 "7"(r0_tm3_4),
2623 "8"(t0),
2624 "9"(t1),
2625 "10"(t2),
2626 "11"(t3),
2627 "w"(_coeff0), // %24
2628 "w"(_coeff1), // %25
2629 "r"(step) // %26
2630 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
2631 #endif // __aarch64__
2632 #else
2633 const float* r0 = img0.row(i * 6) + j * 6;
2634
2635 for (int m = 0; m < 8; m++)
2636 {
2637 tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25f;
2638 tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25f;
2639
2640 float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25f);
2641 float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25f);
2642
2643 tmp[1][m] = tmp12a + tmp12b;
2644 tmp[2][m] = tmp12a - tmp12b;
2645
2646 float tmp34a = (r0[6] + r0[2] * 0.25f - r0[4] * 1.25f);
2647 float tmp34b = (r0[1] * 0.5f - r0[3] * 2.5f + r0[5] * 2.f);
2648
2649 tmp[3][m] = tmp34a + tmp34b;
2650 tmp[4][m] = tmp34a - tmp34b;
2651
2652 float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25f) * 4.f);
2653 float tmp56b = (r0[1] * 2.f - r0[3] * 2.5f + r0[5] * 0.5f);
2654
2655 tmp[5][m] = tmp56a + tmp56b;
2656 tmp[6][m] = tmp56a - tmp56b;
2657
2658 r0 += w;
2659 }
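// for reference, the eight formulas above can be read off as the rows of the
// input transform B^T for Winograd F(6x6,3x3):
// const float itm[8][8] = {
//     {1.0f,  0.0f, -5.25f,  0.00f,  5.25f,  0.00f, -1.0f, 0.0f},
//     {0.0f,  1.0f,  1.00f, -4.25f, -4.25f,  1.00f,  1.0f, 0.0f},
//     {0.0f, -1.0f,  1.00f,  4.25f, -4.25f, -1.00f,  1.0f, 0.0f},
//     {0.0f,  0.5f,  0.25f, -2.50f, -1.25f,  2.00f,  1.0f, 0.0f},
//     {0.0f, -0.5f,  0.25f,  2.50f, -1.25f, -2.00f,  1.0f, 0.0f},
//     {0.0f,  2.0f,  4.00f, -2.50f, -5.00f,  0.50f,  1.0f, 0.0f},
//     {0.0f, -2.0f,  4.00f,  2.50f, -5.00f, -0.50f,  1.0f, 0.0f},
//     {0.0f, -1.0f,  0.00f,  5.25f,  0.00f, -5.25f,  0.0f, 1.0f}
// };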
2660
2661 float* r0_tm_0 = img0_tm.row(i * w_tm / 8 + j);
2662 float* r0_tm_4 = img0_tm.row(i * w_tm / 8 + j + tiles);
2663
2664 for (int m = 0; m < 8; m++)
2665 {
2666 const float* tmp0 = tmp[m];
2667
2668 r0_tm_0[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f;
2669 r0_tm_4[3] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f;
2670
2671 float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25f);
2672 float tmp12b = (tmp0[1] - tmp0[3] * 4.25f + tmp0[5]);
2673
2674 r0_tm_0[1] = tmp12a + tmp12b;
2675 r0_tm_0[2] = tmp12a - tmp12b;
2676
2677 float tmp34a = (tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f);
2678 float tmp34b = (tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f);
2679
2680 r0_tm_0[3] = tmp34a + tmp34b;
2681 r0_tm_4[0] = tmp34a - tmp34b;
2682
2683 float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f);
2684 float tmp56b = (tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f);
2685
2686 r0_tm_4[1] = tmp56a + tmp56b;
2687 r0_tm_4[2] = tmp56a - tmp56b;
2688
2689 r0_tm_0 += img0_tm.w * tiles * 2;
2690 r0_tm_4 += img0_tm.w * tiles * 2;
2691 }
2692 #endif // __ARM_NEON
2693 }
2694 }
2695 }
2696 }
2697 bottom_blob_bordered = Mat();
2698 // END transform input
2699
2700 // BEGIN dot
2701 Mat top_blob_tm;
2702 {
2703 int w_tm = outw / 6 * 8;
2704 int h_tm = outh / 6 * 8;
2705 top_blob_tm.create(4, 16 * w_tm / 8 * h_tm / 8, outch, 4u, opt.workspace_allocator);
2706
2707 const int tiles = h_tm / 8 * w_tm / 8;
2708
2709 int nn_outch = outch >> 2;
2710 int remain_outch_start = nn_outch << 2;
2711
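// each transformed tile holds 64 floats as 16 groups of 4, laid out
// [16][tiles][4]; conceptually, each pass r below computes, per tile,
//   out_tm[r][t][0..3] += in_tm[r][t][0..3] * ktm[r][0..3];
// accumulated over the input channels handled by the loop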
2712 #pragma omp parallel for num_threads(opt.num_threads)
2713 for (int pp = 0; pp < nn_outch; pp++)
2714 {
2715 int p = pp * 4;
2716
2717 Mat out0_tm = top_blob_tm.channel(p);
2718 Mat out1_tm = top_blob_tm.channel(p + 1);
2719 Mat out2_tm = top_blob_tm.channel(p + 2);
2720 Mat out3_tm = top_blob_tm.channel(p + 3);
2721
2722 const float* ktm = kernel_tm.channel(pp);
2723
2724 out0_tm.fill(0.f);
2725 out1_tm.fill(0.f);
2726 out2_tm.fill(0.f);
2727 out3_tm.fill(0.f);
2728
2729 int q = 0;
2730
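// walk the input channels in blocks of four (aarch64 asm), then pairs, then singles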
2731 #if __ARM_NEON && __aarch64__
2732 for (; q + 3 < inch; q += 4)
2733 {
2734 const float* r0 = bottom_blob_tm.channel(q);
2735 const float* r1 = bottom_blob_tm.channel(q + 1);
2736 const float* r2 = bottom_blob_tm.channel(q + 2);
2737 const float* r3 = bottom_blob_tm.channel(q + 3);
2738
2739 float* output0_tm = out0_tm;
2740 float* output1_tm = out1_tm;
2741 float* output2_tm = out2_tm;
2742 float* output3_tm = out3_tm;
2743
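// dot kernel: 4 output channels x 4 input channels; the outer w0 loop runs
// the 16 transform rows, and the tile loop retires four tiles per iteration,
// alternating v20-v23 and v24-v27 with x4-x7 shadowing the output pointers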
2744 asm volatile(
2745 "mov w0, #16 \n" // w0 = r = 16
2746 "0: \n"
2747
2748 "prfm pldl1keep, [%8, #512] \n"
2749 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%8], #64 \n" // v0 v1 v2 v3 = _k00 _k01 _k02 _k03
2750
2751 "prfm pldl1keep, [%8, #512] \n"
2752 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%8], #64 \n" // v4 v5 v6 v7 = _k10 _k11 _k12 _k13
2753
2754 "prfm pldl1keep, [%8, #512] \n"
2755 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64 \n" // v8 v9 v10 v11 = _k20 _k21 _k22 _k23
2756
2757 "prfm pldl1keep, [%8, #512] \n"
2758 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n" // v12 v13 v14 v15 = _k30 _k31 _k32 _k33
2759
2760 // tile loop
2761 "lsr w1, %w18, #2 \n" // w1 = nn = tiles >> 2
2762 "cmp w1, #0 \n"
2763 "beq 2f \n"
2764
2765 //BEGIN tile loop
2766 "prfm pldl1keep, [%4, #128] \n" //
2767 "ld1 {v16.4s}, [%4], #16 \n"
2768
2769 "1: \n"
2770
2771 "prfm pldl1keep, [%0, #128] \n"
2772 "ld1 {v20.4s}, [%0] \n"
2773 "add x4, %0, #16 \n" // x4 = %0 next
2774
2775 "fmla v20.4s, v16.4s, v0.4s \n"
2776
2777 "prfm pldl1keep, [%1, #128] \n"
2778 "ld1 {v21.4s}, [%1] \n"
2779 "add x5, %1, #16 \n" // x5 = %1 next
2780
2781 "fmla v21.4s, v16.4s, v4.4s \n"
2782
2783 "prfm pldl1keep, [%2, #128] \n"
2784 "ld1 {v22.4s}, [%2] \n"
2785 "add x6, %2, #16 \n" // x6 = %2 next
2786
2787 "fmla v22.4s, v16.4s, v8.4s \n"
2788
2789 "prfm pldl1keep, [%3, #128] \n"
2790 "ld1 {v23.4s}, [%3] \n"
2791 "add x7, %3, #16 \n" // x7 = %3 next
2792
2793 "prfm pldl1keep, [%5, #128] \n"
2794 "ld1 {v17.4s}, [%5], #16 \n"
2795
2796 "fmla v23.4s, v16.4s, v12.4s \n"
2797
2798 "prfm pldl1keep, [x4, #128] \n"
2799 "ld1 {v24.4s}, [x4] \n"
2800
2801 "fmla v20.4s, v17.4s, v1.4s \n"
2802 "fmla v21.4s, v17.4s, v5.4s \n"
2803
2804 "prfm pldl1keep, [%6, #128] \n"
2805 "ld1 {v18.4s}, [%6], #16 \n"
2806
2807 "fmla v22.4s, v17.4s, v9.4s \n"
2808 "fmla v23.4s, v17.4s, v13.4s \n"
2809
2810 "prfm pldl1keep, [x5, #128] \n"
2811 "ld1 {v25.4s}, [x5] \n"
2812
2813 "fmla v20.4s, v18.4s, v2.4s \n"
2814 "fmla v21.4s, v18.4s, v6.4s \n"
2815
2816 "prfm pldl1keep, [%7, #128] \n"
2817 "ld1 {v19.4s}, [%7], #16 \n"
2818
2819 "fmla v22.4s, v18.4s, v10.4s \n"
2820 "fmla v23.4s, v18.4s, v14.4s \n"
2821
2822 "prfm pldl1keep, [x6, #128] \n"
2823 "ld1 {v26.4s}, [x6] \n"
2824
2825 "fmla v20.4s, v19.4s, v3.4s \n"
2826 "fmla v21.4s, v19.4s, v7.4s \n"
2827
2828 "prfm pldl1keep, [%4, #128] \n"
2829 "ld1 {v16.4s}, [%4], #16 \n"
2830
2831 "fmla v22.4s, v19.4s, v11.4s \n"
2832 "fmla v23.4s, v19.4s, v15.4s \n"
2833
2834 ///////
2835
2836 "prfm pldl1keep, [x7, #128] \n"
2837 "ld1 {v27.4s}, [x7] \n"
2838
2839 "st1 {v20.4s}, [%0] \n"
2840 "add %0, %0, #32 \n"
2841
2842 "fmla v24.4s, v16.4s, v0.4s \n"
2843 "fmla v25.4s, v16.4s, v4.4s \n"
2844
2845 "prfm pldl1keep, [%5, #128] \n"
2846 "ld1 {v17.4s}, [%5], #16 \n"
2847
2848 "fmla v26.4s, v16.4s, v8.4s \n"
2849 "fmla v27.4s, v16.4s, v12.4s \n"
2850
2851 "prfm pldl1keep, [%0, #128] \n"
2852 "ld1 {v20.4s}, [%0] \n"
2853
2854 "st1 {v21.4s}, [%1] \n"
2855 "add %1, %1, #32 \n"
2856
2857 "fmla v24.4s, v17.4s, v1.4s \n"
2858 "fmla v25.4s, v17.4s, v5.4s \n"
2859
2860 "prfm pldl1keep, [%6, #128] \n"
2861 "ld1 {v18.4s}, [%6], #16 \n"
2862
2863 "fmla v26.4s, v17.4s, v9.4s \n"
2864 "fmla v27.4s, v17.4s, v13.4s \n"
2865
2866 "prfm pldl1keep, [%1, #128] \n"
2867 "ld1 {v21.4s}, [%1] \n"
2868
2869 "st1 {v22.4s}, [%2] \n"
2870 "add %2, %2, #32 \n"
2871
2872 "fmla v24.4s, v18.4s, v2.4s \n"
2873 "fmla v25.4s, v18.4s, v6.4s \n"
2874
2875 "prfm pldl1keep, [%7, #128] \n"
2876 "ld1 {v19.4s}, [%7], #16 \n"
2877
2878 "fmla v26.4s, v18.4s, v10.4s \n"
2879 "fmla v27.4s, v18.4s, v14.4s \n"
2880
2881 "prfm pldl1keep, [%2, #128] \n"
2882 "ld1 {v22.4s}, [%2] \n"
2883
2884 "st1 {v23.4s}, [%3] \n"
2885 "add %3, %3, #32 \n"
2886
2887 "fmla v24.4s, v19.4s, v3.4s \n"
2888 "fmla v25.4s, v19.4s, v7.4s \n"
2889
2890 "prfm pldl1keep, [%4, #128] \n"
2891 "ld1 {v16.4s}, [%4], #16 \n"
2892
2893 "fmla v26.4s, v19.4s, v11.4s \n"
2894 "fmla v27.4s, v19.4s, v15.4s \n"
2895
2896 ///////
2897
2898 "prfm pldl1keep, [%3, #128] \n"
2899 "ld1 {v23.4s}, [%3] \n"
2900
2901 "st1 {v24.4s}, [x4] \n"
2902 "add x4, x4, #32 \n"
2903
2904 "fmla v20.4s, v16.4s, v0.4s \n"
2905 "fmla v21.4s, v16.4s, v4.4s \n"
2906
2907 "prfm pldl1keep, [%5, #128] \n"
2908 "ld1 {v17.4s}, [%5], #16 \n"
2909
2910 "fmla v22.4s, v16.4s, v8.4s \n"
2911 "fmla v23.4s, v16.4s, v12.4s \n"
2912
2913 "prfm pldl1keep, [x4, #128] \n"
2914 "ld1 {v24.4s}, [x4] \n"
2915
2916 "st1 {v25.4s}, [x5] \n"
2917 "add x5, x5, #32 \n"
2918
2919 "fmla v20.4s, v17.4s, v1.4s \n"
2920 "fmla v21.4s, v17.4s, v5.4s \n"
2921
2922 "prfm pldl1keep, [%6, #128] \n"
2923 "ld1 {v18.4s}, [%6], #16 \n"
2924
2925 "fmla v22.4s, v17.4s, v9.4s \n"
2926 "fmla v23.4s, v17.4s, v13.4s \n"
2927
2928 "prfm pldl1keep, [x5, #128] \n"
2929 "ld1 {v25.4s}, [x5] \n"
2930
2931 "st1 {v26.4s}, [x6] \n"
2932 "add x6, x6, #32 \n"
2933
2934 "fmla v20.4s, v18.4s, v2.4s \n"
2935 "fmla v21.4s, v18.4s, v6.4s \n"
2936
2937 "prfm pldl1keep, [%7, #128] \n"
2938 "ld1 {v19.4s}, [%7], #16 \n"
2939
2940 "fmla v22.4s, v18.4s, v10.4s \n"
2941 "fmla v23.4s, v18.4s, v14.4s \n"
2942
2943 "prfm pldl1keep, [x6, #128] \n"
2944 "ld1 {v26.4s}, [x6] \n"
2945
2946 "st1 {v27.4s}, [x7] \n"
2947 "add x7, x7, #32 \n"
2948
2949 "fmla v20.4s, v19.4s, v3.4s \n"
2950 "fmla v21.4s, v19.4s, v7.4s \n"
2951
2952 "prfm pldl1keep, [%4, #128] \n"
2953 "ld1 {v16.4s}, [%4], #16 \n"
2954
2955 "fmla v22.4s, v19.4s, v11.4s \n"
2956 "fmla v23.4s, v19.4s, v15.4s \n"
2957
2958 ///////
2959
2960 "prfm pldl1keep, [x7, #128] \n"
2961 "ld1 {v27.4s}, [x7] \n"
2962
2963 "st1 {v20.4s}, [%0] \n"
2964
2965 "fmla v24.4s, v16.4s, v0.4s \n"
2966 "fmla v25.4s, v16.4s, v4.4s \n"
2967
2968 "prfm pldl1keep, [%5, #128] \n"
2969 "ld1 {v17.4s}, [%5], #16 \n"
2970
2971 "fmla v26.4s, v16.4s, v8.4s \n"
2972 "fmla v27.4s, v16.4s, v12.4s \n"
2973
2974 "st1 {v21.4s}, [%1] \n"
2975
2976 "fmla v24.4s, v17.4s, v1.4s \n"
2977 "fmla v25.4s, v17.4s, v5.4s \n"
2978
2979 "prfm pldl1keep, [%6, #128] \n"
2980 "ld1 {v18.4s}, [%6], #16 \n"
2981
2982 "fmla v26.4s, v17.4s, v9.4s \n"
2983 "fmla v27.4s, v17.4s, v13.4s \n"
2984
2985 "st1 {v22.4s}, [%2] \n"
2986
2987 "fmla v24.4s, v18.4s, v2.4s \n"
2988 "fmla v25.4s, v18.4s, v6.4s \n"
2989
2990 "prfm pldl1keep, [%7, #128] \n"
2991 "ld1 {v19.4s}, [%7], #16 \n"
2992
2993 "fmla v26.4s, v18.4s, v10.4s \n"
2994 "fmla v27.4s, v18.4s, v14.4s \n"
2995
2996 "st1 {v23.4s}, [%3] \n"
2997
2998 "fmla v24.4s, v19.4s, v3.4s \n"
2999 "fmla v25.4s, v19.4s, v7.4s \n"
3000
3001 "prfm pldl1keep, [%4, #128] \n"
3002 "ld1 {v16.4s}, [%4], #16 \n"
3003
3004 "fmla v26.4s, v19.4s, v11.4s \n"
3005 "fmla v27.4s, v19.4s, v15.4s \n"
3006
3007 "st1 {v24.4s}, [x4], #16 \n"
3008 "mov %0, x4 \n"
3009
3010 "st1 {v25.4s}, [x5], #16 \n"
3011 "mov %1, x5 \n"
3012
3013 "subs w1, w1, #1 \n"
3014
3015 "st1 {v26.4s}, [x6], #16 \n"
3016 "mov %2, x6 \n"
3017
3018 "st1 {v27.4s}, [x7], #16 \n"
3019 "mov %3, x7 \n"
3020
3021 "bne 1b \n"
3022 "sub %4, %4, #16 \n"
3023 //END tile loop
3024
3025 "2: \n"
3026
3027 // remain loop
3028 "and w1, %w18, #3 \n" // w1 = remain = tiles & 3;
3029 "cmp w1, #0 \n"
3030 "beq 4f \n"
3031
3032 //BEGIN remain loop
3033 "3: \n"
3034
3035 "prfm pldl1keep, [%4, #128] \n"
3036 "ld1 {v16.4s}, [%4], #16 \n"
3037
3038 "prfm pldl1keep, [%0, #128] \n"
3039 "ld1 {v20.4s}, [%0] \n"
3040
3041 "fmla v20.4s, v16.4s, v0.4s \n"
3042
3043 "prfm pldl1keep, [%1, #128] \n"
3044 "ld1 {v21.4s}, [%1] \n"
3045
3046 "fmla v21.4s, v16.4s, v4.4s \n"
3047
3048 "prfm pldl1keep, [%2, #128] \n"
3049 "ld1 {v22.4s}, [%2] \n"
3050
3051 "fmla v22.4s, v16.4s, v8.4s \n"
3052
3053 "prfm pldl1keep, [%3, #128] \n"
3054 "ld1 {v23.4s}, [%3] \n"
3055
3056 "fmla v23.4s, v16.4s, v12.4s \n"
3057
3058 "prfm pldl1keep, [%5, #128] \n"
3059 "ld1 {v17.4s}, [%5], #16 \n"
3060
3061 "fmla v20.4s, v17.4s, v1.4s \n"
3062 "fmla v21.4s, v17.4s, v5.4s \n"
3063
3064 "fmla v22.4s, v17.4s, v9.4s \n"
3065 "fmla v23.4s, v17.4s, v13.4s \n"
3066
3067 "prfm pldl1keep, [%6, #128] \n"
3068 "ld1 {v18.4s}, [%6], #16 \n"
3069
3070 "fmla v20.4s, v18.4s, v2.4s \n"
3071 "fmla v21.4s, v18.4s, v6.4s \n"
3072
3073 "fmla v22.4s, v18.4s, v10.4s \n"
3074 "fmla v23.4s, v18.4s, v14.4s \n"
3075
3076 "prfm pldl1keep, [%7, #128] \n"
3077 "ld1 {v19.4s}, [%7], #16 \n"
3078
3079 "fmla v20.4s, v19.4s, v3.4s \n"
3080 "fmla v21.4s, v19.4s, v7.4s \n"
3081 "fmla v22.4s, v19.4s, v11.4s \n"
3082 "fmla v23.4s, v19.4s, v15.4s \n"
3083
3084 "st1 {v20.4s}, [%0], #16 \n"
3085 "st1 {v21.4s}, [%1], #16 \n"
3086
3087 "subs w1, w1, #1 \n"
3088
3089 "st1 {v22.4s}, [%2], #16 \n"
3090 "st1 {v23.4s}, [%3], #16 \n"
3091
3092 "bne 3b \n"
3093 //END remain loop
3094
3095 "4: \n"
3096
3097 "subs w0, w0, #1 \n"
3098 "bne 0b \n"
3099
3100 : "=r"(output0_tm), // %0
3101 "=r"(output1_tm), // %1
3102 "=r"(output2_tm), // %2
3103 "=r"(output3_tm), // %3
3104 "=r"(r0), // %4
3105 "=r"(r1), // %5
3106 "=r"(r2), // %6
3107 "=r"(r3), // %7
3108 "=r"(ktm) // %8
3109 : "0"(output0_tm),
3110 "1"(output1_tm),
3111 "2"(output2_tm),
3112 "3"(output3_tm),
3113 "4"(r0),
3114 "5"(r1),
3115 "6"(r2),
3116 "7"(r3),
3117 "8"(ktm),
3118 "r"(tiles) // %18
3119 : "cc", "memory", "x0", "x1", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
3120 }
3121 #endif // __ARM_NEON && __aarch64__
3122
3123 for (; q + 1 < inch; q += 2)
3124 {
3125 const float* r0 = bottom_blob_tm.channel(q);
3126 const float* r1 = bottom_blob_tm.channel(q + 1);
3127
3128 float* output0_tm = out0_tm;
3129 float* output1_tm = out1_tm;
3130 float* output2_tm = out2_tm;
3131 float* output3_tm = out3_tm;
3132
3133 #if __ARM_NEON
3134 #if __aarch64__
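// dot kernel: 4 output channels x 2 input channels, tile loop unrolled 4x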
3135 asm volatile(
3136 "mov w0, #16 \n" // w0 = r = 16
3137 "0: \n"
3138
3139 "prfm pldl1keep, [%6, #256] \n"
3140 "ld1 {v0.4s, v1.4s}, [%6], #32 \n" // v0 v1 = _k00 _k01
3141
3142 "prfm pldl1keep, [%6, #256] \n"
3143 "ld1 {v2.4s, v3.4s}, [%6], #32 \n" // v2 v3 = _k10 _k11
3144
3145 "prfm pldl1keep, [%6, #256] \n"
3146 "ld1 {v4.4s, v5.4s}, [%6], #32 \n" // v4 v5 = _k20 _k21
3147
3148 "prfm pldl1keep, [%6, #256] \n"
3149 "ld1 {v6.4s, v7.4s}, [%6], #32 \n" // v6 v7 = _k30 _k31
3150
3151 // tile loop
3152 "lsr w1, %w14, #2 \n" // w1 = nn = tiles >> 2
3153 "cmp w1, #0 \n"
3154 "beq 2f \n"
3155
3156 //BEGIN tile loop
3157 "prfm pldl1keep, [%4, #128] \n"
3158 "ld1 {v20.4s}, [%4], #16 \n"
3159
3160 "1: \n"
3161
3162 "prfm pldl1keep, [%0, #128] \n"
3163 "ld1 {v16.4s}, [%0] \n"
3164
3165 "fmla v16.4s, v20.4s, v0.4s \n"
3166
3167 "prfm pldl1keep, [%1, #128] \n"
3168 "ld1 {v17.4s}, [%1] \n"
3169
3170 "fmla v17.4s, v20.4s, v2.4s \n"
3171
3172 "prfm pldl1keep, [%2, #128] \n"
3173 "ld1 {v18.4s}, [%2] \n"
3174
3175 "fmla v18.4s, v20.4s, v4.4s \n"
3176
3177 "prfm pldl1keep, [%3, #128] \n"
3178 "ld1 {v19.4s}, [%3] \n"
3179
3180 "fmla v19.4s, v20.4s, v6.4s \n"
3181
3182 "prfm pldl1keep, [%5, #128] \n"
3183 "ld1 {v21.4s}, [%5], #16 \n"
3184
3185 "fmla v16.4s, v21.4s, v1.4s \n"
3186 "fmla v17.4s, v21.4s, v3.4s \n"
3187
3188 "prfm pldl1keep, [%4, #128] \n"
3189 "ld1 {v20.4s}, [%4], #16 \n"
3190
3191 "fmla v18.4s, v21.4s, v5.4s \n"
3192 "fmla v19.4s, v21.4s, v7.4s \n"
3193
3194 "st1 {v16.4s}, [%0], #16 \n"
3195 "st1 {v17.4s}, [%1], #16 \n"
3196
3197 ////
3198
3199 "prfm pldl1keep, [%0, #128] \n"
3200 "ld1 {v16.4s}, [%0] \n"
3201
3202 "fmla v16.4s, v20.4s, v0.4s \n"
3203
3204 "prfm pldl1keep, [%1, #128] \n"
3205 "ld1 {v17.4s}, [%1] \n"
3206
3207 "fmla v17.4s, v20.4s, v2.4s \n"
3208
3209 "st1 {v18.4s}, [%2], #16 \n"
3210 "st1 {v19.4s}, [%3], #16 \n"
3211
3212 "prfm pldl1keep, [%2, #128] \n"
3213 "ld1 {v18.4s}, [%2] \n"
3214
3215 "fmla v18.4s, v20.4s, v4.4s \n"
3216
3217 "prfm pldl1keep, [%3, #128] \n"
3218 "ld1 {v19.4s}, [%3] \n"
3219
3220 "fmla v19.4s, v20.4s, v6.4s \n"
3221
3222 "prfm pldl1keep, [%5, #128] \n"
3223 "ld1 {v21.4s}, [%5], #16 \n"
3224
3225 "fmla v16.4s, v21.4s, v1.4s \n"
3226 "fmla v17.4s, v21.4s, v3.4s \n"
3227
3228 "prfm pldl1keep, [%4, #128] \n"
3229 "ld1 {v20.4s}, [%4], #16 \n"
3230
3231 "fmla v18.4s, v21.4s, v5.4s \n"
3232 "fmla v19.4s, v21.4s, v7.4s \n"
3233
3234 "st1 {v16.4s}, [%0], #16 \n"
3235 "st1 {v17.4s}, [%1], #16 \n"
3236
3237 ////
3238
3239 "prfm pldl1keep, [%0, #128] \n"
3240 "ld1 {v16.4s}, [%0] \n"
3241
3242 "fmla v16.4s, v20.4s, v0.4s \n"
3243
3244 "prfm pldl1keep, [%1, #128] \n"
3245 "ld1 {v17.4s}, [%1] \n"
3246
3247 "fmla v17.4s, v20.4s, v2.4s \n"
3248
3249 "st1 {v18.4s}, [%2], #16 \n"
3250 "st1 {v19.4s}, [%3], #16 \n"
3251
3252 "prfm pldl1keep, [%2, #128] \n"
3253 "ld1 {v18.4s}, [%2] \n"
3254
3255 "fmla v18.4s, v20.4s, v4.4s \n"
3256
3257 "prfm pldl1keep, [%3, #128] \n"
3258 "ld1 {v19.4s}, [%3] \n"
3259
3260 "fmla v19.4s, v20.4s, v6.4s \n"
3261
3262 "prfm pldl1keep, [%5, #128] \n"
3263 "ld1 {v21.4s}, [%5], #16 \n"
3264
3265 "fmla v16.4s, v21.4s, v1.4s \n"
3266 "fmla v17.4s, v21.4s, v3.4s \n"
3267
3268 "prfm pldl1keep, [%4, #128] \n"
3269 "ld1 {v20.4s}, [%4], #16 \n"
3270
3271 "fmla v18.4s, v21.4s, v5.4s \n"
3272 "fmla v19.4s, v21.4s, v7.4s \n"
3273
3274 "st1 {v16.4s}, [%0], #16 \n"
3275 "st1 {v17.4s}, [%1], #16 \n"
3276
3277 ////
3278
3279 "prfm pldl1keep, [%0, #128] \n"
3280 "ld1 {v16.4s}, [%0] \n"
3281
3282 "fmla v16.4s, v20.4s, v0.4s \n"
3283
3284 "prfm pldl1keep, [%1, #128] \n"
3285 "ld1 {v17.4s}, [%1] \n"
3286
3287 "fmla v17.4s, v20.4s, v2.4s \n"
3288
3289 "st1 {v18.4s}, [%2], #16 \n"
3290 "st1 {v19.4s}, [%3], #16 \n"
3291
3292 "prfm pldl1keep, [%2, #128] \n"
3293 "ld1 {v18.4s}, [%2] \n"
3294
3295 "fmla v18.4s, v20.4s, v4.4s \n"
3296
3297 "prfm pldl1keep, [%3, #128] \n"
3298 "ld1 {v19.4s}, [%3] \n"
3299
3300 "fmla v19.4s, v20.4s, v6.4s \n"
3301
3302 "prfm pldl1keep, [%5, #128] \n"
3303 "ld1 {v21.4s}, [%5], #16 \n"
3304
3305 "fmla v16.4s, v21.4s, v1.4s \n"
3306 "fmla v17.4s, v21.4s, v3.4s \n"
3307
3308 "prfm pldl1keep, [%4, #128] \n"
3309 "ld1 {v20.4s}, [%4], #16 \n"
3310
3311 "fmla v18.4s, v21.4s, v5.4s \n"
3312 "fmla v19.4s, v21.4s, v7.4s \n"
3313
3314 "st1 {v16.4s}, [%0], #16 \n"
3315 "st1 {v17.4s}, [%1], #16 \n"
3316
3317 "subs w1, w1, #1 \n"
3318
3319 "st1 {v18.4s}, [%2], #16 \n"
3320 "st1 {v19.4s}, [%3], #16 \n"
3321
3322 "bne 1b \n"
3323 "sub %4, %4, #16 \n"
3324 //END tile loop
3325
3326 "2: \n"
3327
3328 // remain loop
3329 "and w1, %w14, #3 \n" // w1 = remain = tiles & 3;
3330 "cmp w1, #0 \n"
3331 "beq 4f \n"
3332
3333 //BEGIN remain loop
3334 "3: \n"
3335
3336 "prfm pldl1keep, [%4, #128] \n"
3337 "ld1 {v20.4s}, [%4], #16 \n"
3338
3339 "prfm pldl1keep, [%0, #128] \n"
3340 "ld1 {v16.4s}, [%0] \n"
3341
3342 "fmla v16.4s, v20.4s, v0.4s \n"
3343
3344 "prfm pldl1keep, [%1, #128] \n"
3345 "ld1 {v17.4s}, [%1] \n"
3346
3347 "fmla v17.4s, v20.4s, v2.4s \n"
3348
3349 "prfm pldl1keep, [%2, #128] \n"
3350 "ld1 {v18.4s}, [%2] \n"
3351
3352 "fmla v18.4s, v20.4s, v4.4s \n"
3353
3354 "prfm pldl1keep, [%3, #128] \n"
3355 "ld1 {v19.4s}, [%3] \n"
3356
3357 "fmla v19.4s, v20.4s, v6.4s \n"
3358
3359 "prfm pldl1keep, [%5, #128] \n"
3360 "ld1 {v21.4s}, [%5], #16 \n"
3361
3362 "fmla v16.4s, v21.4s, v1.4s \n"
3363 "fmla v17.4s, v21.4s, v3.4s \n"
3364 "fmla v18.4s, v21.4s, v5.4s \n"
3365 "fmla v19.4s, v21.4s, v7.4s \n"
3366
3367 "st1 {v16.4s}, [%0], #16 \n"
3368 "st1 {v17.4s}, [%1], #16 \n"
3369
3370 "subs w1, w1, #1 \n"
3371
3372 "st1 {v18.4s}, [%2], #16 \n"
3373 "st1 {v19.4s}, [%3], #16 \n"
3374
3375 "bne 3b \n"
3376 //END remain loop
3377
3378 "4: \n"
3379
3380 "subs w0, w0, #1 \n"
3381 "bne 0b \n"
3382
3383 : "=r"(output0_tm), // %0
3384 "=r"(output1_tm), // %1
3385 "=r"(output2_tm), // %2
3386 "=r"(output3_tm), // %3
3387 "=r"(r0), // %4
3388 "=r"(r1), // %5
3389 "=r"(ktm) // %6
3390 : "0"(output0_tm),
3391 "1"(output1_tm),
3392 "2"(output2_tm),
3393 "3"(output3_tm),
3394 "4"(r0),
3395 "5"(r1),
3396 "6"(ktm),
3397 "r"(tiles) // %14
3398 : "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21");
3399 #else
3400 asm volatile(
3401 "mov r0, #16 \n" // r0 = r = 16
3402 "0: \n"
3403
3404 "pld [%6, #256] \n"
3405 "vld1.f32 {d0-d3}, [%6 :128]! \n" // q0 q1 = _k00 _k01
3406
3407 "pld [%6, #256] \n"
3408 "vld1.f32 {d4-d7}, [%6 :128]! \n" // q2 q3 = _k10 _k11
3409
3410 "pld [%6, #256] \n"
3411 "vld1.f32 {d8-d11}, [%6 :128]! \n" // q4 q5 = _k20 _k21
3412
3413 "pld [%6, #256] \n"
3414 "vld1.f32 {d12-d15}, [%6 :128]! \n" // q6 q7 = _k30 _k31
3415
3416 // tile loop
3417 "lsr r1, %14, #2 \n" // r1 = nn = tiles >> 2
3418 "cmp r1, #0 \n"
3419 "beq 2f \n"
3420
3421 //BEGIN tile loop
3422 "pld [%4, #128] \n"
3423 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3424
3425 "1: \n"
3426
3427 "pld [%0, #128] \n"
3428 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3429
3430 "vmla.f32 q8, q12, q0 \n"
3431
3432 "pld [%1, #128] \n"
3433 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3434
3435 "vmla.f32 q9, q12, q2 \n"
3436
3437 "pld [%2, #128] \n"
3438 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3439
3440 "vmla.f32 q10, q12, q4 \n"
3441
3442 "pld [%3, #128] \n"
3443 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3444
3445 "vmla.f32 q11, q12, q6 \n"
3446
3447 "pld [%5, #128] \n"
3448 "vld1.f32 {d26-d27}, [%5 :128]! \n" // q13 = _r1
3449
3450 "vmla.f32 q8, q13, q1 \n"
3451 "vmla.f32 q9, q13, q3 \n"
3452
3453 "pld [%4, #128] \n"
3454 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3455
3456 "vmla.f32 q10, q13, q5 \n"
3457 "vmla.f32 q11, q13, q7 \n"
3458
3459 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3460 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3461
3462 ////
3463
3464 "pld [%0, #128] \n"
3465 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3466
3467 "vmla.f32 q8, q12, q0 \n"
3468
3469 "pld [%1, #128] \n"
3470 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3471
3472 "vmla.f32 q9, q12, q2 \n"
3473
3474 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3475 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3476
3477 "pld [%2, #128] \n"
3478 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3479
3480 "vmla.f32 q10, q12, q4 \n"
3481
3482 "pld [%3, #128] \n"
3483 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3484
3485 "vmla.f32 q11, q12, q6 \n"
3486
3487 "pld [%5, #128] \n"
3488 "vld1.f32 {d26-d27}, [%5 :128]! \n" // q13 = _r1
3489
3490 "vmla.f32 q8, q13, q1 \n"
3491 "vmla.f32 q9, q13, q3 \n"
3492
3493 "pld [%4, #128] \n"
3494 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3495
3496 "vmla.f32 q10, q13, q5 \n"
3497 "vmla.f32 q11, q13, q7 \n"
3498
3499 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3500 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3501
3502 ////
3503
3504 "pld [%0, #128] \n"
3505 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3506
3507 "vmla.f32 q8, q12, q0 \n"
3508
3509 "pld [%1, #128] \n"
3510 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3511
3512 "vmla.f32 q9, q12, q2 \n"
3513
3514 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3515 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3516
3517 "pld [%2, #128] \n"
3518 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3519
3520 "vmla.f32 q10, q12, q4 \n"
3521
3522 "pld [%3, #128] \n"
3523 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3524
3525 "vmla.f32 q11, q12, q6 \n"
3526
3527 "pld [%5, #128] \n"
3528 "vld1.f32 {d26-d27}, [%5 :128]! \n" // q13 = _r1
3529
3530 "vmla.f32 q8, q13, q1 \n"
3531 "vmla.f32 q9, q13, q3 \n"
3532
3533 "pld [%4, #128] \n"
3534 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3535
3536 "vmla.f32 q10, q13, q5 \n"
3537 "vmla.f32 q11, q13, q7 \n"
3538
3539 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3540 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3541
3542 ////
3543
3544 "pld [%0, #128] \n"
3545 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3546
3547 "vmla.f32 q8, q12, q0 \n"
3548
3549 "pld [%1, #128] \n"
3550 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3551
3552 "vmla.f32 q9, q12, q2 \n"
3553
3554 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3555 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3556
3557 "pld [%2, #128] \n"
3558 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3559
3560 "vmla.f32 q10, q12, q4 \n"
3561
3562 "pld [%3, #128] \n"
3563 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3564
3565 "vmla.f32 q11, q12, q6 \n"
3566
3567 "pld [%5, #128] \n"
3568 "vld1.f32 {d26-d27}, [%5 :128]! \n" // q13 = _r1
3569
3570 "vmla.f32 q8, q13, q1 \n"
3571 "vmla.f32 q9, q13, q3 \n"
3572
3573 "pld [%4, #128] \n"
3574 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3575
3576 "vmla.f32 q10, q13, q5 \n"
3577 "vmla.f32 q11, q13, q7 \n"
3578
3579 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3580 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3581
3582 "subs r1, #1 \n"
3583
3584 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3585 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3586
3587 "bne 1b \n"
3588 "sub %4, %4, #16 \n"
3589 //END tile loop
3590
3591 "2: \n"
3592
3593 // remain loop
3594 "and r1, %14, #3 \n" // r1 = remain = tiles & 3;
3595 "cmp r1, #0 \n"
3596 "beq 4f \n"
3597
3598 //BEGIN remain loop
3599 "3: \n"
3600
3601 "pld [%4, #128] \n"
3602 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3603
3604 "pld [%0, #128] \n"
3605 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3606
3607 "vmla.f32 q8, q12, q0 \n"
3608
3609 "pld [%1, #128] \n"
3610 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3611
3612 "vmla.f32 q9, q12, q2 \n"
3613
3614 "pld [%2, #128] \n"
3615 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3616
3617 "vmla.f32 q10, q12, q4 \n"
3618
3619 "pld [%3, #128] \n"
3620 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3621
3622 "vmla.f32 q11, q12, q6 \n"
3623
3624 "pld [%5, #128] \n"
3625 "vld1.f32 {d26-d27}, [%5 :128]! \n" // q13 = _r1
3626
3627 "vmla.f32 q8, q13, q1 \n"
3628 "vmla.f32 q9, q13, q3 \n"
3629 "vmla.f32 q10, q13, q5 \n"
3630 "vmla.f32 q11, q13, q7 \n"
3631
3632 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3633 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3634
3635 "subs r1, #1 \n"
3636
3637 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3638 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3639
3640 "bne 3b \n"
3641 //END remain loop
3642
3643 "4: \n"
3644
3645 "subs r0, #1 \n"
3646 "bne 0b \n"
3647
3648 : "=r"(output0_tm), // %0
3649 "=r"(output1_tm), // %1
3650 "=r"(output2_tm), // %2
3651 "=r"(output3_tm), // %3
3652 "=r"(r0), // %4
3653 "=r"(r1), // %5
3654 "=r"(ktm) // %6
3655 : "0"(output0_tm),
3656 "1"(output1_tm),
3657 "2"(output2_tm),
3658 "3"(output3_tm),
3659 "4"(r0),
3660 "5"(r1),
3661 "6"(ktm),
3662 "r"(tiles) // %14
3663 : "cc", "memory", "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13");
3664 #endif // __aarch64__
3665 #else
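// scalar reference for the NEON paths above: for each of the 16 transform
// rows, accumulate two input channels into four output channels, four floats per tile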
3666 for (int r = 0; r < 16; r++)
3667 {
3668 for (int t = 0; t < tiles; t++)
3669 {
3670 for (int m = 0; m < 4; m++)
3671 {
3672 output0_tm[m] += r0[m] * ktm[0 + m];
3673 output0_tm[m] += r1[m] * ktm[4 + m];
3674 output1_tm[m] += r0[m] * ktm[8 + m];
3675 output1_tm[m] += r1[m] * ktm[12 + m];
3676 output2_tm[m] += r0[m] * ktm[16 + m];
3677 output2_tm[m] += r1[m] * ktm[20 + m];
3678 output3_tm[m] += r0[m] * ktm[24 + m];
3679 output3_tm[m] += r1[m] * ktm[28 + m];
3680 }
3681
3682 r0 += 4;
3683 r1 += 4;
3684 output0_tm += 4;
3685 output1_tm += 4;
3686 output2_tm += 4;
3687 output3_tm += 4;
3688 }
3689
3690 ktm += 32;
3691 }
3692 #endif // __ARM_NEON
3693 }
3694
3695 for (; q < inch; q++)
3696 {
3697 const float* r0 = bottom_blob_tm.channel(q);
3698
3699 float* output0_tm = out0_tm;
3700 float* output1_tm = out1_tm;
3701 float* output2_tm = out2_tm;
3702 float* output3_tm = out3_tm;
3703
3704 #if __ARM_NEON
3705 #if __aarch64__
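// dot kernel: 4 output channels x 1 input channel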
3706 asm volatile(
3707 "mov w0, #16 \n" // w0 = r = 16
3708 "0: \n"
3709
3710 "prfm pldl1keep, [%5, #256] \n"
3711 "ld1 {v0.4s, v1.4s}, [%5], #32 \n" // v0 v1 = _k00 _k10
3712
3713 "prfm pldl1keep, [%5, #256] \n"
3714 "ld1 {v2.4s, v3.4s}, [%5], #32 \n" // v2 v3 = _k20 _k30
3715
3716 // tile loop
3717 "mov w1, %w12 \n" // w1 = tiles
3718 "cmp w1, #0 \n"
3719 "beq 2f \n"
3720
3721 //BEGIN tile loop
3722 "1: \n"
3723
3724 "prfm pldl1keep, [%4, #128] \n"
3725 "ld1 {v16.4s}, [%4], #16 \n"
3726
3727 "prfm pldl1keep, [%0, #128] \n"
3728 "ld1 {v17.4s}, [%0] \n"
3729
3730 "fmla v17.4s, v16.4s, v0.4s \n"
3731
3732 "prfm pldl1keep, [%1, #128] \n"
3733 "ld1 {v18.4s}, [%1] \n"
3734
3735 "fmla v18.4s, v16.4s, v1.4s \n"
3736
3737 "prfm pldl1keep, [%2, #128] \n"
3738 "ld1 {v19.4s}, [%2] \n"
3739
3740 "fmla v19.4s, v16.4s, v2.4s \n"
3741
3742 "prfm pldl1keep, [%3, #128] \n"
3743 "ld1 {v20.4s}, [%3] \n"
3744
3745 "fmla v20.4s, v16.4s, v3.4s \n"
3746
3747 "st1 {v17.4s}, [%0], #16 \n"
3748 "st1 {v18.4s}, [%1], #16 \n"
3749
3750 "subs w1, w1, #1 \n"
3751
3752 "st1 {v19.4s}, [%2], #16 \n"
3753 "st1 {v20.4s}, [%3], #16 \n"
3754
3755 "bne 1b \n"
3756 //END tile loop
3757
3758 "2: \n"
3759
3760 "subs w0, w0, #1 \n"
3761 "bne 0b \n"
3762
3763 : "=r"(output0_tm), // %0
3764 "=r"(output1_tm), // %1
3765 "=r"(output2_tm), // %2
3766 "=r"(output3_tm), // %3
3767 "=r"(r0), // %4
3768 "=r"(ktm) // %5
3769 : "0"(output0_tm),
3770 "1"(output1_tm),
3771 "2"(output2_tm),
3772 "3"(output3_tm),
3773 "4"(r0),
3774 "5"(ktm),
3775 "r"(tiles) // %12
3776 : "cc", "memory", "x0", "x1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20");
3777 #else
3778 asm volatile(
3779 "mov r0, #16 \n" // r0 = r = 16
3780 "0: \n"
3781
3782 "pld [%5, #256] \n"
3783 "vld1.f32 {d0-d3}, [%5 :128]! \n" // q0 q1 = _k00 _k10
3784
3785 "pld [%5, #256] \n"
3786 "vld1.f32 {d4-d7}, [%5 :128]! \n" // q2 q3 = _k20 _k30
3787
3788 // tile loop
3789 "mov r1, %12 \n" // r1 = tiles
3790 "cmp r1, #0 \n"
3791 "beq 2f \n"
3792
3793 //BEGIN tile loop
3794 "1: \n"
3795
3796 "pld [%4, #128] \n"
3797 "vld1.f32 {d24-d25}, [%4 :128]! \n" // q12 = _r0
3798
3799 "pld [%0, #128] \n"
3800 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3801
3802 "vmla.f32 q8, q12, q0 \n"
3803
3804 "pld [%1, #128] \n"
3805 "vld1.f32 {d18-d19}, [%1 :128] \n" // q9 = _output1_tm
3806
3807 "vmla.f32 q9, q12, q1 \n"
3808
3809 "pld [%2, #128] \n"
3810 "vld1.f32 {d20-d21}, [%2 :128] \n" // q10 = _output2_tm
3811
3812 "vmla.f32 q10, q12, q2 \n"
3813
3814 "pld [%3, #128] \n"
3815 "vld1.f32 {d22-d23}, [%3 :128] \n" // q11 = _output3_tm
3816
3817 "vmla.f32 q11, q12, q3 \n"
3818
3819 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3820 "vst1.f32 {d18-d19}, [%1 :128]! \n"
3821
3822 "subs r1, #1 \n"
3823
3824 "vst1.f32 {d20-d21}, [%2 :128]! \n"
3825 "vst1.f32 {d22-d23}, [%3 :128]! \n"
3826
3827 "bne 1b \n"
3828 //END tile loop
3829
3830 "2: \n"
3831
3832 "subs r0, #1 \n"
3833 "bne 0b \n"
3834
3835 : "=r"(output0_tm), // %0
3836 "=r"(output1_tm), // %1
3837 "=r"(output2_tm), // %2
3838 "=r"(output3_tm), // %3
3839 "=r"(r0), // %4
3840 "=r"(ktm) // %5
3841 : "0"(output0_tm),
3842 "1"(output1_tm),
3843 "2"(output2_tm),
3844 "3"(output3_tm),
3845 "4"(r0),
3846 "5"(ktm),
3847 "r"(tiles) // %12
3848 : "cc", "memory", "r0", "r1", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13");
3849 #endif // __aarch64__
3850 #else
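// scalar reference: one input channel accumulated into four output channels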
3851 for (int r = 0; r < 16; r++)
3852 {
3853 for (int t = 0; t < tiles; t++)
3854 {
3855 for (int m = 0; m < 4; m++)
3856 {
3857 output0_tm[m] += r0[m] * ktm[0 + m];
3858 output1_tm[m] += r0[m] * ktm[4 + m];
3859 output2_tm[m] += r0[m] * ktm[8 + m];
3860 output3_tm[m] += r0[m] * ktm[12 + m];
3861 }
3862
3863 r0 += 4;
3864 output0_tm += 4;
3865 output1_tm += 4;
3866 output2_tm += 4;
3867 output3_tm += 4;
3868 }
3869
3870 ktm += 16;
3871 }
3872 #endif // __ARM_NEON
3873 }
3874 }
3875
3876 #pragma omp parallel for num_threads(opt.num_threads)
3877 for (int p = remain_outch_start; p < outch; p++)
3878 {
3879 Mat out0_tm = top_blob_tm.channel(p);
3880
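// leftover output channels are packed back-to-back in
// kernel_tm.channel(nn_outch), 8*8*inch transformed coefficients apiece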
3881 const float* ktm = (const float*)kernel_tm.channel(nn_outch) + 8 * 8 * inch * (p - remain_outch_start);
3882
3883 out0_tm.fill(0.f);
3884
3885 int q = 0;
3886
3887 for (; q < inch; q++)
3888 {
3889 const float* r0 = bottom_blob_tm.channel(q);
3890
3891 float* output0_tm = out0_tm;
3892
3893 for (int r = 0; r < 16; r++)
3894 {
3895 #if __ARM_NEON
3896 float32x4_t _k00 = vld1q_f32(ktm);
3897 ktm += 4;
3898 #endif // __ARM_NEON
3899
3900 // tile
3901 for (int i = 0; i < tiles; i++)
3902 {
3903 #if __ARM_NEON
3904 #if __aarch64__
3905 asm volatile(
3906 "prfm pldl1keep, [%1, #128] \n"
3907 "ld1 {v17.4s}, [%1], #16 \n"
3908
3909 "prfm pldl1keep, [%0, #128] \n"
3910 "ld1 {v16.4s}, [%0] \n"
3911
3912 "fmla v16.4s, v17.4s, %4.4s \n"
3913
3914 "st1 {v16.4s}, [%0], #16 \n"
3915 : "=r"(output0_tm), // %0
3916 "=r"(r0) // %1
3917 : "0"(output0_tm),
3918 "1"(r0),
3919 "w"(_k00) // %4
3920 : "cc", "memory", "v16", "v17");
3921 #else
3922 asm volatile(
3923 "pld [%1, #128] \n"
3924 "vld1.f32 {d18-d19}, [%1 :128]! \n" // q9 = _r0
3925
3926 "pld [%0, #128] \n"
3927 "vld1.f32 {d16-d17}, [%0 :128] \n" // q8 = _output0_tm
3928
3929 "vmla.f32 q8, q9, %q4 \n"
3930
3931 "vst1.f32 {d16-d17}, [%0 :128]! \n"
3932 : "=r"(output0_tm), // %0
3933 "=r"(r0) // %1
3934 : "0"(output0_tm),
3935 "1"(r0),
3936 "w"(_k00) // %4
3937 : "cc", "memory", "q8", "q9");
3938 #endif // __aarch64__
3939 #else
3940 for (int m = 0; m < 4; m++)
3941 {
3942 output0_tm[m] += r0[m] * ktm[m];
3943 }
3944
3945 r0 += 4;
3946 output0_tm += 4;
3947 #endif // __ARM_NEON
3948 }
3949
3950 #if !__ARM_NEON
3951 ktm += 4;
3952 #endif // !__ARM_NEON
3953 }
3954 }
3955 }
3956 }
3957 bottom_blob_tm = Mat();
3958 // END dot
3959
3960 // BEGIN transform output
3961 Mat top_blob_bordered;
3962 top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
3963 {
3964 // const float otm[6][8] = {
3965 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
3966 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
3967 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
3968 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
3969 // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
3970 // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
3971 // };
3972
3973 // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
3974 // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
3975 // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
3976 // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
3977 // 4 = (r1 + r2) + (r3 + r4) * 16 + (r5 + r6) * 2
3978 // 5 = r7 + (r1 - r2) + (r3 - r4) * 32 + (r5 - r6)
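// the six rows share (r1 +/- r2), (r3 +/- r4) and (r5 +/- r6); the code
// below computes them once per column as _tmp024a/b/c and _tmp135a/b/c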
3979
3980 #if __ARM_NEON
3981 const float coeff[4] = {4.f, 8.f, 16.f, 32.f};
3982 float32x4_t _coeff = vld1q_f32(coeff);
3983 #endif // __ARM_NEON
3984
3985 int w_tm = outw / 6 * 8;
3986 int h_tm = outh / 6 * 8;
3987 const int tiles = w_tm / 8 * h_tm / 8;
3988
3989 #pragma omp parallel for num_threads(opt.num_threads)
3990 for (int p = 0; p < outch; p++)
3991 {
3992 const Mat out0_tm = top_blob_tm.channel(p);
3993 Mat out0 = top_blob_bordered.channel(p);
3994
3995 const float bias0 = bias ? bias[p] : 0.f;
3996 #if __ARM_NEON
3997 float32x2_t _bias0 = vdup_n_f32(bias0);
3998 #endif // __ARM_NEON
3999
4000 float tmp[6][8];
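// first pass below: A^T folds the eight transform rows down to six, kept in
// tmp[6][8]; second pass folds the eight columns to six and adds the bias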
4001
4002 // tile
4003 for (int i = 0; i < outh / 6; i++)
4004 {
4005 for (int j = 0; j < outw / 6; j++)
4006 {
4007 #if __ARM_NEON
4008 const float* output0_tm0_0 = out0_tm.row(i * w_tm / 8 + j);
4009 const float* output0_tm0_4 = out0_tm.row(i * w_tm / 8 + j + tiles);
4010 const float* output0_tm1_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 2);
4011 const float* output0_tm1_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 3);
4012 const float* output0_tm2_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 4);
4013 const float* output0_tm2_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 5);
4014 const float* output0_tm3_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 6);
4015 const float* output0_tm3_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 7);
4016
4017 #if __aarch64__
4018 for (int m = 0; m + 3 < 8; m += 4)
4019 {
4020 float32x4_t _output0_tm0_0123 = vld1q_f32(output0_tm0_0);
4021 float32x4_t _output0_tm0_4567 = vld1q_f32(output0_tm0_4);
4022 float32x4_t _output0_tm1_0123 = vld1q_f32(output0_tm1_0);
4023 float32x4_t _output0_tm1_4567 = vld1q_f32(output0_tm1_4);
4024 float32x4_t _output0_tm2_0123 = vld1q_f32(output0_tm2_0);
4025 float32x4_t _output0_tm2_4567 = vld1q_f32(output0_tm2_4);
4026 float32x4_t _output0_tm3_0123 = vld1q_f32(output0_tm3_0);
4027 float32x4_t _output0_tm3_4567 = vld1q_f32(output0_tm3_4);
4028
4029 float32x4x2_t _output0_tm01_00221133 = vtrnq_f32(_output0_tm0_0123, _output0_tm1_0123);
4030 float32x4x2_t _output0_tm01_44665577 = vtrnq_f32(_output0_tm0_4567, _output0_tm1_4567);
4031 float32x4x2_t _output0_tm23_00221133 = vtrnq_f32(_output0_tm2_0123, _output0_tm3_0123);
4032 float32x4x2_t _output0_tm23_44665577 = vtrnq_f32(_output0_tm2_4567, _output0_tm3_4567);
4033
4034 // no vswp intrinsic :( emulate it with vcombine to finish the 4x4 transpose
4035 float32x4_t _output0_tm_00 = vcombine_f32(vget_low_f32(_output0_tm01_00221133.val[0]), vget_low_f32(_output0_tm23_00221133.val[0]));
4036 float32x4_t _output0_tm_11 = vcombine_f32(vget_low_f32(_output0_tm01_00221133.val[1]), vget_low_f32(_output0_tm23_00221133.val[1]));
4037 float32x4_t _output0_tm_22 = vcombine_f32(vget_high_f32(_output0_tm01_00221133.val[0]), vget_high_f32(_output0_tm23_00221133.val[0]));
4038 float32x4_t _output0_tm_33 = vcombine_f32(vget_high_f32(_output0_tm01_00221133.val[1]), vget_high_f32(_output0_tm23_00221133.val[1]));
4039 float32x4_t _output0_tm_44 = vcombine_f32(vget_low_f32(_output0_tm01_44665577.val[0]), vget_low_f32(_output0_tm23_44665577.val[0]));
4040 float32x4_t _output0_tm_55 = vcombine_f32(vget_low_f32(_output0_tm01_44665577.val[1]), vget_low_f32(_output0_tm23_44665577.val[1]));
4041 float32x4_t _output0_tm_66 = vcombine_f32(vget_high_f32(_output0_tm01_44665577.val[0]), vget_high_f32(_output0_tm23_44665577.val[0]));
4042 float32x4_t _output0_tm_77 = vcombine_f32(vget_high_f32(_output0_tm01_44665577.val[1]), vget_high_f32(_output0_tm23_44665577.val[1]));
4043
4044 float32x4_t _tmp024a = vaddq_f32(_output0_tm_11, _output0_tm_22);
4045 float32x4_t _tmp135a = vsubq_f32(_output0_tm_11, _output0_tm_22);
4046
4047 float32x4_t _tmp024b = vaddq_f32(_output0_tm_33, _output0_tm_44);
4048 float32x4_t _tmp135b = vsubq_f32(_output0_tm_33, _output0_tm_44);
4049
4050 float32x4_t _tmp024c = vaddq_f32(_output0_tm_55, _output0_tm_66);
4051 float32x4_t _tmp135c = vsubq_f32(_output0_tm_55, _output0_tm_66);
4052
4053 float32x4_t _tmp0 = vaddq_f32(_output0_tm_00, _tmp024a);
4054 _tmp0 = vmlaq_lane_f32(_tmp0, _tmp024c, vget_high_f32(_coeff), 1);
4055 _tmp0 = vaddq_f32(_tmp0, _tmp024b);
4056
4057 float32x4_t _tmp2 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
4058 _tmp2 = vmlaq_lane_f32(_tmp2, _tmp024c, vget_low_f32(_coeff), 1);
4059
4060 float32x4_t _tmp4 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
4061 _tmp4 = vaddq_f32(_tmp4, _tmp024c);
4062 _tmp4 = vaddq_f32(_tmp4, _tmp024c); // added twice: + (r5 + r6) * 2
4063
4064 vst1q_f32(&tmp[0][m], _tmp0);
4065 vst1q_f32(&tmp[2][m], _tmp2);
4066 vst1q_f32(&tmp[4][m], _tmp4);
4067
4068 float32x4_t _tmp1 = vmlaq_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
4069 _tmp1 = vaddq_f32(_tmp1, _tmp135b);
4070 _tmp1 = vaddq_f32(_tmp1, _tmp135b); // added twice: + (r3 - r4) * 2
4071
4072 float32x4_t _tmp3 = vmlaq_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
4073 _tmp3 = vmlaq_lane_f32(_tmp3, _tmp135c, vget_low_f32(_coeff), 0);
4074
4075 float32x4_t _tmp5 = vaddq_f32(_output0_tm_77, _tmp135a);
4076 _tmp5 = vmlaq_lane_f32(_tmp5, _tmp135b, vget_high_f32(_coeff), 1);
4077 _tmp5 = vaddq_f32(_tmp5, _tmp135c);
4078
4079 vst1q_f32(&tmp[1][m], _tmp1);
4080 vst1q_f32(&tmp[3][m], _tmp3);
4081 vst1q_f32(&tmp[5][m], _tmp5);
4082
4083 output0_tm0_0 += out0_tm.w * tiles * 2 * 4;
4084 output0_tm0_4 += out0_tm.w * tiles * 2 * 4;
4085 output0_tm1_0 += out0_tm.w * tiles * 2 * 4;
4086 output0_tm1_4 += out0_tm.w * tiles * 2 * 4;
4087 output0_tm2_0 += out0_tm.w * tiles * 2 * 4;
4088 output0_tm2_4 += out0_tm.w * tiles * 2 * 4;
4089 output0_tm3_0 += out0_tm.w * tiles * 2 * 4;
4090 output0_tm3_4 += out0_tm.w * tiles * 2 * 4;
4091 }
4092
4093 const float* t0 = tmp[0];
4094 const float* t1 = tmp[1];
4095
4096 float* output0 = out0.row(i * 6) + j * 6;
4097 float* output1 = output0 + outw;
4098
4099 for (int m = 0; m + 1 < 6; m += 2)
4100 {
4101 float32x4_t _t0_0123 = vld1q_f32(t0);
4102 float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
4103 float32x4_t _t1_0123 = vld1q_f32(t1);
4104 float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
4105
4106 float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
4107 float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
4108
4109 float32x2_t _t_00 = vget_low_f32(_t01_00221133.val[0]);
4110 float32x2_t _t_11 = vget_low_f32(_t01_00221133.val[1]);
4111 float32x2_t _t_22 = vget_high_f32(_t01_00221133.val[0]);
4112 float32x2_t _t_33 = vget_high_f32(_t01_00221133.val[1]);
4113 float32x2_t _t_44 = vget_low_f32(_t01_44665577.val[0]);
4114 float32x2_t _t_55 = vget_low_f32(_t01_44665577.val[1]);
4115 float32x2_t _t_66 = vget_high_f32(_t01_44665577.val[0]);
4116 float32x2_t _t_77 = vget_high_f32(_t01_44665577.val[1]);
4117
4118 float32x2_t _tmp024a = vadd_f32(_t_11, _t_22);
4119 float32x2_t _tmp135a = vsub_f32(_t_11, _t_22);
4120
4121 float32x2_t _tmp024b = vadd_f32(_t_33, _t_44);
4122 float32x2_t _tmp135b = vsub_f32(_t_33, _t_44);
4123
4124 float32x2_t _tmp024c = vadd_f32(_t_55, _t_66);
4125 float32x2_t _tmp135c = vsub_f32(_t_55, _t_66);
4126
4127 float32x2_t _output_0 = vadd_f32(_t_00, _tmp024a);
4128 _output_0 = vmla_lane_f32(_output_0, _tmp024c, vget_high_f32(_coeff), 1);
4129 _output_0 = vadd_f32(_output_0, _tmp024b);
4130 _output_0 = vadd_f32(_output_0, _bias0);
4131
4132 float32x2_t _output_2 = vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
4133 _output_2 = vmla_lane_f32(_output_2, _tmp024c, vget_low_f32(_coeff), 1);
4134 _output_2 = vadd_f32(_output_2, _bias0);
4135
4136 float32x2_t _output_4 = vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
4137 _output_4 = vadd_f32(_output_4, _tmp024c);
4138 _output_4 = vadd_f32(_output_4, _tmp024c); // added twice: + (r5 + r6) * 2
4139 _output_4 = vadd_f32(_output_4, _bias0);
4140
4141 output0[0] = vget_lane_f32(_output_0, 0);
4142 output1[0] = vget_lane_f32(_output_0, 1);
4143 output0[2] = vget_lane_f32(_output_2, 0);
4144 output1[2] = vget_lane_f32(_output_2, 1);
4145 output0[4] = vget_lane_f32(_output_4, 0);
4146 output1[4] = vget_lane_f32(_output_4, 1);
4147
4148 float32x2_t _output_1 = vmla_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
4149 _output_1 = vadd_f32(_output_1, _tmp135b);
4150 _output_1 = vadd_f32(_output_1, _tmp135b); // added twice: + (r3 - r4) * 2
4151 _output_1 = vadd_f32(_output_1, _bias0);
4152
4153 float32x2_t _output_3 = vmla_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
4154 _output_3 = vmla_lane_f32(_output_3, _tmp135c, vget_low_f32(_coeff), 0);
4155 _output_3 = vadd_f32(_output_3, _bias0);
4156
4157 float32x2_t _output_5 = vadd_f32(_t_77, _tmp135a);
4158 _output_5 = vmla_lane_f32(_output_5, _tmp135b, vget_high_f32(_coeff), 1);
4159 _output_5 = vadd_f32(_output_5, _tmp135c);
4160 _output_5 = vadd_f32(_output_5, _bias0);
4161
4162 output0[1] = vget_lane_f32(_output_1, 0);
4163 output1[1] = vget_lane_f32(_output_1, 1);
4164 output0[3] = vget_lane_f32(_output_3, 0);
4165 output1[3] = vget_lane_f32(_output_3, 1);
4166 output0[5] = vget_lane_f32(_output_5, 0);
4167 output1[5] = vget_lane_f32(_output_5, 1);
4168
4169 t0 += 8 * 2;
4170 t1 += 8 * 2;
4171 output0 += outw * 2;
4172 output1 += outw * 2;
4173 }
4174 #else // __aarch64__
4175 float* t0 = tmp[0];
4176 float* t1 = tmp[1];
4177
4178 int step = out0_tm.w * tiles * 2 * 4 * 4;
4179
4180 asm volatile(
4181
4182 // loop0
4183 "vld1.f32 {d16-d17}, [%2], %21 \n"
4184 "vld1.f32 {d18-d19}, [%3], %21 \n"
4185 "vld1.f32 {d20-d21}, [%4], %21 \n"
4186 "vld1.f32 {d22-d23}, [%5], %21 \n"
4187 "vld1.f32 {d24-d25}, [%6], %21 \n"
4188 "vld1.f32 {d26-d27}, [%7], %21 \n"
4189 "vld1.f32 {d28-d29}, [%8], %21 \n"
4190 "vld1.f32 {d30-d31}, [%9], %21 \n"
4191
4192 "vtrn.32 q8, q10 \n"
4193 "vtrn.32 q9, q11 \n"
4194 "vtrn.32 q12, q14 \n"
4195 "vtrn.32 q13, q15 \n"
4196
4197 "vswp d17, d24 \n"
4198 "vswp d19, d26 \n"
4199 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
4200 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
4201
4202 "vadd.f32 q2, q10, q12 \n"
4203 "vsub.f32 q3, q10, q12 \n"
4204
4205 "vadd.f32 q4, q14, q9 \n"
4206 "vsub.f32 q5, q14, q9 \n"
4207
4208 "vadd.f32 q6, q11, q13 \n"
4209 "vsub.f32 q7, q11, q13 \n" // spare q9 q10 q11 q12 q13 q14
4210
4211 "vmov q9, q3 \n"
4212 "vadd.f32 q8, q8, q2 \n"
4213 "vmla.f32 q9, q7, %f20[0] \n"
4214 "vmov q12, q2 \n"
4215 "vmov q10, q2 \n"
4216 "vmov q11, q3 \n"
4217 "vmla.f32 q12, q4, %f20[0] \n"
4218 "vadd.f32 q15, q15, q3 \n"
4219 "vmla.f32 q8, q6, %f20[1] \n"
4220 "vadd.f32 q9, q9, q5 \n"
4221 "vmla.f32 q10, q4, %e20[0] \n"
4222 "vmla.f32 q11, q5, %e20[1] \n"
4223 "vadd.f32 q12, q12, q6 \n"
4224 "vmla.f32 q15, q5, %f20[1] \n"
4225 "vadd.f32 q8, q8, q4 \n"
4226 "vadd.f32 q9, q9, q5 \n"
4227 "vmla.f32 q10, q6, %e20[1] \n"
4228 "vmla.f32 q11, q7, %e20[0] \n"
4229 "vadd.f32 q12, q12, q6 \n"
4230 "vadd.f32 q15, q15, q7 \n"
4231
4232 "vst1.f32 {d16-d17}, [%0] \n"
4233 "add %0, %0, #64 \n"
4234
4235 "vst1.f32 {d18-d19}, [%1] \n"
4236 "add %1, %1, #64 \n"
4237
4238 "vst1.f32 {d20-d21}, [%0] \n"
4239 "add %0, %0, #64 \n"
4240
4241 "vst1.f32 {d22-d23}, [%1] \n"
4242 "add %1, %1, #64 \n"
4243
4244 "vst1.f32 {d24-d25}, [%0] \n"
4245 "sub %0, %0, #112 \n"
4246
4247 "vst1.f32 {d30-d31}, [%1] \n"
4248 "sub %1, %1, #112 \n"
4249
4250 // loop1
4251 "vld1.f32 {d16-d17}, [%2] \n"
4252 "vld1.f32 {d18-d19}, [%3] \n"
4253 "vld1.f32 {d20-d21}, [%4] \n"
4254 "vld1.f32 {d22-d23}, [%5] \n"
4255 "vld1.f32 {d24-d25}, [%6] \n"
4256 "vld1.f32 {d26-d27}, [%7] \n"
4257 "vld1.f32 {d28-d29}, [%8] \n"
4258 "vld1.f32 {d30-d31}, [%9] \n"
4259
4260 "vtrn.32 q8, q10 \n"
4261 "vtrn.32 q9, q11 \n"
4262 "vtrn.32 q12, q14 \n"
4263 "vtrn.32 q13, q15 \n"
4264
4265 "vswp d17, d24 \n"
4266 "vswp d19, d26 \n"
4267 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
4268 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
4269
4270 "vadd.f32 q2, q10, q12 \n"
4271 "vsub.f32 q3, q10, q12 \n"
4272
4273 "vadd.f32 q4, q14, q9 \n"
4274 "vsub.f32 q5, q14, q9 \n"
4275
4276 "vadd.f32 q6, q11, q13 \n"
4277 "vsub.f32 q7, q11, q13 \n" // spare q9 q10 q11 q12 q13 q14
4278
4279 "vmov q9, q3 \n"
4280 "vadd.f32 q8, q8, q2 \n"
4281 "vmla.f32 q9, q7, %f20[0] \n"
4282 "vmov q12, q2 \n"
4283 "vmov q10, q2 \n"
4284 "vmov q11, q3 \n"
4285 "vmla.f32 q12, q4, %f20[0] \n"
4286 "vadd.f32 q15, q15, q3 \n"
4287 "vmla.f32 q8, q6, %f20[1] \n"
4288 "vadd.f32 q9, q9, q5 \n"
4289 "vmla.f32 q10, q4, %e20[0] \n"
4290 "vmla.f32 q11, q5, %e20[1] \n"
4291 "vadd.f32 q12, q12, q6 \n"
4292 "vmla.f32 q15, q5, %f20[1] \n"
4293 "vadd.f32 q8, q8, q4 \n"
4294 "vadd.f32 q9, q9, q5 \n"
4295 "vmla.f32 q10, q6, %e20[1] \n"
4296 "vmla.f32 q11, q7, %e20[0] \n"
4297 "vadd.f32 q12, q12, q6 \n"
4298 "vadd.f32 q15, q15, q7 \n"
4299
4300 "vst1.f32 {d16-d17}, [%0] \n"
4301 "add %0, %0, #64 \n"
4302
4303 "vst1.f32 {d18-d19}, [%1] \n"
4304 "add %1, %1, #64 \n"
4305
4306 "vst1.f32 {d20-d21}, [%0] \n"
4307 "add %0, %0, #64 \n"
4308
4309 "vst1.f32 {d22-d23}, [%1] \n"
4310 "add %1, %1, #64 \n"
4311
4312 "vst1.f32 {d24-d25}, [%0] \n"
4313
4314 "vst1.f32 {d30-d31}, [%1] \n"
4315
4316 : "=r"(t0), // %0
4317 "=r"(t1), // %1
4318 "=r"(output0_tm0_0), // %2
4319 "=r"(output0_tm0_4), // %3
4320 "=r"(output0_tm1_0), // %4
4321 "=r"(output0_tm1_4), // %5
4322 "=r"(output0_tm2_0), // %6
4323 "=r"(output0_tm2_4), // %7
4324 "=r"(output0_tm3_0), // %8
4325 "=r"(output0_tm3_4) // %9
4326 : "0"(t0),
4327 "1"(t1),
4328 "2"(output0_tm0_0),
4329 "3"(output0_tm0_4),
4330 "4"(output0_tm1_0),
4331 "5"(output0_tm1_4),
4332 "6"(output0_tm2_0),
4333 "7"(output0_tm2_4),
4334 "8"(output0_tm3_0),
4335 "9"(output0_tm3_4),
4336 "w"(_coeff), // %20
4337 "r"(step) // %21
4338 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4339
4340 t0 = tmp[0];
4341 t1 = tmp[1];
4342
4343 float* output0 = out0.row(i * 6) + j * 6;
4344 float* output1 = output0 + outw;
4345
4346 int stepw = outw * 2 * 4;
4347
4348 asm volatile(
4349
4350 // loop0
4351 "vld1.f32 {d16-d19}, [%2] \n"
4352 "vld1.f32 {d20-d23}, [%3] \n"
4353
4354 "add %2, %2, #64 \n"
4355 "add %3, %3, #64 \n"
4356
4357 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
4358 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
4359
4360 "vadd.f32 d4, d20, d17 \n"
4361 "vsub.f32 d5, d20, d17 \n"
4362
4363 "vadd.f32 d6, d21, d18 \n"
4364 "vsub.f32 d7, d21, d18 \n"
4365
4366 "vadd.f32 d8, d22, d19 \n"
4367 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
4368
4369 "vmov d20, d5 \n"
4370 "vmov d18, d4 \n"
4371
4372 "vadd.f32 d16, d16, d4 \n"
4373 "vmla.f32 d20, d9, %f8[0] \n"
4374 "vmov d17, d4 \n"
4375 "vmov d21, d5 \n"
4376 "vmla.f32 d18, d6, %f8[0] \n"
4377 "vadd.f32 d22, d23, d5 \n"
4378
4379 "vmla.f32 d16, d8, %f8[1] \n"
4380 "vadd.f32 d20, d20, d7 \n"
4381 "vmla.f32 d17, d6, %e8[0] \n"
4382 "vmla.f32 d21, d7, %e8[1] \n"
4383 "vadd.f32 d18, d18, d8 \n"
4384 "vmla.f32 d22, d7, %f8[1] \n"
4385
4386 "vadd.f32 d16, d16, d6 \n"
4387 "vadd.f32 d20, d20, d7 \n"
4388 "vmla.f32 d17, d8, %e8[1] \n"
4389 "vmla.f32 d21, d9, %e8[0] \n"
4390 "vadd.f32 d18, d18, d8 \n"
4391 "vadd.f32 d22, d22, d9 \n"
4392
4393 "vadd.f32 d16, d16, %P9 \n" // _bias0
4394 "vadd.f32 d20, d20, %P9 \n" // _bias0
4395 "vadd.f32 d17, d17, %P9 \n" // _bias0
4396 "vadd.f32 d21, d21, %P9 \n" // _bias0
4397 "vadd.f32 d18, d18, %P9 \n" // _bias0
4398 "vadd.f32 d22, d22, %P9 \n" // _bias0
4399
4400 "vtrn.f32 q8, q10 \n"
4401 "vtrn.f32 d18, d22 \n"
4402
4403 "vst1.f32 {d16-d18}, [%0], %10 \n"
4404 "vst1.f32 {d20-d22}, [%1], %10 \n"
4405
4406 // loop1
4407 "vld1.f32 {d16-d19}, [%2] \n"
4408 "vld1.f32 {d20-d23}, [%3] \n"
4409
4410 "add %2, %2, #64 \n"
4411 "add %3, %3, #64 \n"
4412
4413 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
4414 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
4415
4416 "vadd.f32 d4, d20, d17 \n"
4417 "vsub.f32 d5, d20, d17 \n"
4418
4419 "vadd.f32 d6, d21, d18 \n"
4420 "vsub.f32 d7, d21, d18 \n"
4421
4422 "vadd.f32 d8, d22, d19 \n"
4423 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
4424
4425 "vmov d20, d5 \n"
4426 "vmov d18, d4 \n"
4427
4428 "vadd.f32 d16, d16, d4 \n"
4429 "vmla.f32 d20, d9, %f8[0] \n"
4430 "vmov d17, d4 \n"
4431 "vmov d21, d5 \n"
4432 "vmla.f32 d18, d6, %f8[0] \n"
4433 "vadd.f32 d22, d23, d5 \n"
4434
4435 "vmla.f32 d16, d8, %f8[1] \n"
4436 "vadd.f32 d20, d20, d7 \n"
4437 "vmla.f32 d17, d6, %e8[0] \n"
4438 "vmla.f32 d21, d7, %e8[1] \n"
4439 "vadd.f32 d18, d18, d8 \n"
4440 "vmla.f32 d22, d7, %f8[1] \n"
4441
4442 "vadd.f32 d16, d16, d6 \n"
4443 "vadd.f32 d20, d20, d7 \n"
4444 "vmla.f32 d17, d8, %e8[1] \n"
4445 "vmla.f32 d21, d9, %e8[0] \n"
4446 "vadd.f32 d18, d18, d8 \n"
4447 "vadd.f32 d22, d22, d9 \n"
4448
4449 "vadd.f32 d16, d16, %P9 \n" // _bias0
4450 "vadd.f32 d20, d20, %P9 \n" // _bias0
4451 "vadd.f32 d17, d17, %P9 \n" // _bias0
4452 "vadd.f32 d21, d21, %P9 \n" // _bias0
4453 "vadd.f32 d18, d18, %P9 \n" // _bias0
4454 "vadd.f32 d22, d22, %P9 \n" // _bias0
4455
4456 "vtrn.f32 q8, q10 \n"
4457 "vtrn.f32 d18, d22 \n"
4458
4459 "vst1.f32 {d16-d18}, [%0], %10 \n"
4460 "vst1.f32 {d20-d22}, [%1], %10 \n"
4461
4462 // loop2
4463 "vld1.f32 {d16-d19}, [%2] \n"
4464 "vld1.f32 {d20-d23}, [%3] \n"
4465
4466 "add %2, %2, #64 \n"
4467 "add %3, %3, #64 \n"
4468
4469 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
4470 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
4471
4472 "vadd.f32 d4, d20, d17 \n"
4473 "vsub.f32 d5, d20, d17 \n"
4474
4475 "vadd.f32 d6, d21, d18 \n"
4476 "vsub.f32 d7, d21, d18 \n"
4477
4478 "vadd.f32 d8, d22, d19 \n"
4479 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
4480
4481 "vmov d20, d5 \n"
4482 "vmov d18, d4 \n"
4483
4484 "vadd.f32 d16, d16, d4 \n"
4485 "vmla.f32 d20, d9, %f8[0] \n"
4486 "vmov d17, d4 \n"
4487 "vmov d21, d5 \n"
4488 "vmla.f32 d18, d6, %f8[0] \n"
4489 "vadd.f32 d22, d23, d5 \n"
4490
4491 "vmla.f32 d16, d8, %f8[1] \n"
4492 "vadd.f32 d20, d20, d7 \n"
4493 "vmla.f32 d17, d6, %e8[0] \n"
4494 "vmla.f32 d21, d7, %e8[1] \n"
4495 "vadd.f32 d18, d18, d8 \n"
4496 "vmla.f32 d22, d7, %f8[1] \n"
4497
4498 "vadd.f32 d16, d16, d6 \n"
4499 "vadd.f32 d20, d20, d7 \n"
4500 "vmla.f32 d17, d8, %e8[1] \n"
4501 "vmla.f32 d21, d9, %e8[0] \n"
4502 "vadd.f32 d18, d18, d8 \n"
4503 "vadd.f32 d22, d22, d9 \n"
4504
4505 "vadd.f32 d16, d16, %P9 \n" // _bias0
4506 "vadd.f32 d20, d20, %P9 \n" // _bias0
4507 "vadd.f32 d17, d17, %P9 \n" // _bias0
4508 "vadd.f32 d21, d21, %P9 \n" // _bias0
4509 "vadd.f32 d18, d18, %P9 \n" // _bias0
4510 "vadd.f32 d22, d22, %P9 \n" // _bias0
4511
4512 "vtrn.f32 q8, q10 \n"
4513 "vtrn.f32 d18, d22 \n"
4514
4515 "vst1.f32 {d16-d18}, [%0], %10 \n"
4516 "vst1.f32 {d20-d22}, [%1], %10 \n"
4517
4518 : "=r"(output0), // %0
4519 "=r"(output1), // %1
4520 "=r"(t0), // %2
4521 "=r"(t1) // %3
4522 : "0"(output0),
4523 "1"(output1),
4524 "2"(t0),
4525 "3"(t1),
4526 "w"(_coeff), // %8
4527 "w"(_bias0), // %9
4528 "r"(stepw) // %10
4529 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4530 #endif // __aarch64__
4531 #else
4532 const float* output0_tm_0 = out0_tm.row(i * w_tm / 8 + j);
4533 const float* output0_tm_4 = out0_tm.row(i * w_tm / 8 + j + tiles);
4534
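                // scalar fallback: apply the output transform in two passes,
                // first across the 8x8 tile into tmp[6][8], then along the rows
                // of tmp into the final 6x6 block, adding the bias at the end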
4535 for (int m = 0; m < 8; m++)
4536 {
4537 float tmp024a = output0_tm_0[1] + output0_tm_0[2];
4538 float tmp135a = output0_tm_0[1] - output0_tm_0[2];
4539
4540 float tmp024b = output0_tm_0[3] + output0_tm_4[0];
4541 float tmp135b = output0_tm_0[3] - output0_tm_4[0];
4542
4543 float tmp024c = output0_tm_4[1] + output0_tm_4[2];
4544 float tmp135c = output0_tm_4[1] - output0_tm_4[2];
4545
4546 tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
4547 tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
4548 tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
4549
4550 tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
4551 tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
4552 tmp[5][m] = output0_tm_4[3] + tmp135a + tmp135b * 32 + tmp135c;
4553
4554 output0_tm_0 += out0_tm.w * tiles * 2;
4555 output0_tm_4 += out0_tm.w * tiles * 2;
4556 }
4557
4558 float* output0 = out0.row(i * 6) + j * 6;
4559
4560 for (int m = 0; m < 6; m++)
4561 {
4562 const float* tmp0 = tmp[m];
4563
4564 float tmp024a = tmp0[1] + tmp0[2];
4565 float tmp135a = tmp0[1] - tmp0[2];
4566
4567 float tmp024b = tmp0[3] + tmp0[4];
4568 float tmp135b = tmp0[3] - tmp0[4];
4569
4570 float tmp024c = tmp0[5] + tmp0[6];
4571 float tmp135c = tmp0[5] - tmp0[6];
4572
4573 output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
4574 output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
4575 output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
4576
4577 output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
4578 output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
4579 output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
4580
4581 output0 += outw;
4582 }
4583 #endif // __ARM_NEON
4584 }
4585 }
4586 }
4587 }
4588 // END transform output
4589
    // crop the bordered result back to the requested top_blob size
4591 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
4592 }
4593
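// Winograd F(6x6, 3x3): pad the input to a multiple of 6 plus 2, transform each
// 8x8 input tile (B^T d B), multiply with the pre-transformed kernels position by
// position (the "dot" stage below), transform the 8x8 products back into 6x6
// output tiles (A^T m A), then crop the padding away again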
static void conv3x3s1_winograd64_neon5(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
4595 {
4596 int w = bottom_blob.w;
4597 int h = bottom_blob.h;
4598 int inch = bottom_blob.c;
4599
4600 int outw = top_blob.w;
4601 int outh = top_blob.h;
4602 int outch = top_blob.c;
4603
4604 // pad to 6n+2
4605 Mat bottom_blob_bordered = bottom_blob;
4606
4607 outw = (outw + 5) / 6 * 6;
4608 outh = (outh + 5) / 6 * 6;
4609
4610 w = outw + 2;
4611 h = outh + 2;
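    // each 6x6 output tile needs an 8x8 input patch (6 + kernel_size - 1),
    // so the padded input is the rounded-up output size plus a 2-pixel border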
4612 Option opt_b = opt;
4613 opt_b.blob_allocator = opt.workspace_allocator;
4614 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b);
4615
4616 const float* bias = _bias;
4617
4618 // BEGIN transform input
4619 Mat bottom_blob_tm;
4620 {
4621 int w_tm = outw / 6 * 8;
4622 int h_tm = outh / 6 * 8;
4623 const int tiles = w_tm / 8 * h_tm / 8;
4624 bottom_blob_tm.create(1, 64 * tiles, inch, 4u, opt.workspace_allocator);
4625 // bottom_blob_tm.create(inch, tiles, 64);
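        // layout: one channel per input channel holding 64 * tiles values,
        // position-major -- transform position r of tile t lives at offset
        // r * tiles + t, so the dot stage can read all tiles of one position
        // as a contiguous run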
4626
4627 // const float itm[8][8] = {
4628 // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f},
4629 //
4630 // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f},
4631 // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f},
4632 //
4633 // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f},
4634 // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f},
4635 //
4636 // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f},
4637 // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f},
4638 //
4639 // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f}
4640 // };
4641
4642 // 0 = r00 - r06 + (r04 - r02) * 5.25
4643 // 7 = r07 - r01 + (r03 - r05) * 5.25
4644
4645 // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
4646 // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
4647
4648 // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
4649 // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
4650
4651 // reuse r04 * 1.25
4652 // reuse r03 * 2.5
4653 // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
4654 // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
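        // the eight expressions above are the rows of B^T for F(6x6, 3x3);
        // the plain scalar fallback at the end of this loop body implements
        // them literally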
4655
4656 #if __ARM_NEON
4657 const float coeff[8] = {
4658 0.25f, 0.5f, -1.25f, 2.f,
4659 -2.5f, 4.f, 4.25f, 5.25f
4660 };
4661 float32x4_t _coeff0 = vld1q_f32(coeff);
4662 float32x4_t _coeff1 = vld1q_f32(coeff + 4);
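        // lane map for the inline assembly below (%e/%f name the low/high D half
        // of a Q operand): _coeff0 -> [0]=0.25 [1]=0.5 | [0]=-1.25 [1]=2.0
        //                  _coeff1 -> [0]=-2.5 [1]=4.0 | [0]=4.25 [1]=5.25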
4663 #endif // __ARM_NEON
4664
4665 #pragma omp parallel for num_threads(opt.num_threads)
4666 for (int q = 0; q < inch; q++)
4667 {
4668 const Mat img0 = bottom_blob_bordered.channel(q);
4669 Mat img0_tm = bottom_blob_tm.channel(q);
4670
4671 float tmp[8][8];
4672
4673 // tile
4674 for (int i = 0; i < h_tm / 8; i++)
4675 {
4676 for (int j = 0; j < w_tm / 8; j++)
4677 {
4678 #if __ARM_NEON
4679 const float* r0 = img0.row(i * 6) + j * 6;
4680 const float* r1 = r0 + w;
4681 const float* r2 = r0 + w * 2;
4682 const float* r3 = r0 + w * 3;
4683
                    // the assembly block for the armv7 input transform needs 13 general-purpose registers
                    // old gcc may fail to allocate registers on debug builds without -fomit-frame-pointer
                    // so, fall back to the intrinsic version for armv7 debug builds --- nihui
4687 #if __aarch64__ || !defined(NDEBUG)
4688 for (int m = 0; m + 3 < 8; m += 4)
4689 {
4690 float32x4_t _r0_0123 = vld1q_f32(r0);
4691 float32x4_t _r0_4567 = vld1q_f32(r0 + 4);
4692 float32x4_t _r1_0123 = vld1q_f32(r1);
4693 float32x4_t _r1_4567 = vld1q_f32(r1 + 4);
4694 float32x4_t _r2_0123 = vld1q_f32(r2);
4695 float32x4_t _r2_4567 = vld1q_f32(r2 + 4);
4696 float32x4_t _r3_0123 = vld1q_f32(r3);
4697 float32x4_t _r3_4567 = vld1q_f32(r3 + 4);
4698
4699 float32x4x2_t _r01_00221133 = vtrnq_f32(_r0_0123, _r1_0123);
4700 float32x4x2_t _r01_44665577 = vtrnq_f32(_r0_4567, _r1_4567);
4701 float32x4x2_t _r23_00221133 = vtrnq_f32(_r2_0123, _r3_0123);
4702 float32x4x2_t _r23_44665577 = vtrnq_f32(_r2_4567, _r3_4567);
4703
4704 // no vswp intrinsic :(
4705 float32x4_t _r_00 = vcombine_f32(vget_low_f32(_r01_00221133.val[0]), vget_low_f32(_r23_00221133.val[0]));
4706 float32x4_t _r_11 = vcombine_f32(vget_low_f32(_r01_00221133.val[1]), vget_low_f32(_r23_00221133.val[1]));
4707 float32x4_t _r_22 = vcombine_f32(vget_high_f32(_r01_00221133.val[0]), vget_high_f32(_r23_00221133.val[0]));
4708 float32x4_t _r_33 = vcombine_f32(vget_high_f32(_r01_00221133.val[1]), vget_high_f32(_r23_00221133.val[1]));
4709 float32x4_t _r_44 = vcombine_f32(vget_low_f32(_r01_44665577.val[0]), vget_low_f32(_r23_44665577.val[0]));
4710 float32x4_t _r_55 = vcombine_f32(vget_low_f32(_r01_44665577.val[1]), vget_low_f32(_r23_44665577.val[1]));
4711 float32x4_t _r_66 = vcombine_f32(vget_high_f32(_r01_44665577.val[0]), vget_high_f32(_r23_44665577.val[0]));
4712 float32x4_t _r_77 = vcombine_f32(vget_high_f32(_r01_44665577.val[1]), vget_high_f32(_r23_44665577.val[1]));
4713
4714 float32x4_t _r_0_m_6 = vsubq_f32(_r_00, _r_66);
4715 float32x4_t _r_7_m_1 = vsubq_f32(_r_77, _r_11);
4716
4717 float32x4_t _r_4_m_2 = vsubq_f32(_r_44, _r_22);
4718 float32x4_t _r_3_m_5 = vsubq_f32(_r_33, _r_55);
4719
4720 float32x4_t _tmp0 = vmlaq_lane_f32(_r_0_m_6, _r_4_m_2, vget_high_f32(_coeff1), 1);
4721 float32x4_t _tmp7 = vmlaq_lane_f32(_r_7_m_1, _r_3_m_5, vget_high_f32(_coeff1), 1);
4722
4723 vst1q_f32(&tmp[0][m], _tmp0);
4724 vst1q_f32(&tmp[7][m], _tmp7);
4725
4726 float32x4_t _r_2_a_6 = vaddq_f32(_r_22, _r_66);
4727 float32x4_t _r_1_a_5 = vaddq_f32(_r_11, _r_55);
4728
4729 float32x4_t _tmp12a = vmlsq_lane_f32(_r_2_a_6, _r_44, vget_high_f32(_coeff1), 0);
4730 float32x4_t _tmp12b = vmlsq_lane_f32(_r_1_a_5, _r_33, vget_high_f32(_coeff1), 0);
4731
4732 float32x4_t _tmp1 = vaddq_f32(_tmp12a, _tmp12b);
4733 float32x4_t _tmp2 = vsubq_f32(_tmp12a, _tmp12b);
4734
4735 vst1q_f32(&tmp[1][m], _tmp1);
4736 vst1q_f32(&tmp[2][m], _tmp2);
4737
4738 float32x4_t _r_4_x_c = vmulq_lane_f32(_r_44, vget_high_f32(_coeff0), 0);
4739 float32x4_t _r_3_x_c = vmulq_lane_f32(_r_33, vget_low_f32(_coeff1), 0);
4740
4741 float32x4_t _tmp34a = vaddq_f32(_r_66, _r_4_x_c);
4742 _tmp34a = vmlaq_lane_f32(_tmp34a, _r_22, vget_low_f32(_coeff0), 0);
4743
4744 float32x4_t _tmp34b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_low_f32(_coeff0), 1);
4745 _tmp34b = vmlaq_lane_f32(_tmp34b, _r_55, vget_high_f32(_coeff0), 1);
4746
4747 float32x4_t _tmp3 = vaddq_f32(_tmp34a, _tmp34b);
4748 float32x4_t _tmp4 = vsubq_f32(_tmp34a, _tmp34b);
4749
4750 vst1q_f32(&tmp[3][m], _tmp3);
4751 vst1q_f32(&tmp[4][m], _tmp4);
4752
4753 // reuse r04 * 1.25
4754 // reuse r03 * 2.5
4755 float32x4_t _r_2_a_4c = vaddq_f32(_r_22, _r_4_x_c);
4756 float32x4_t _tmp56a = vmlaq_lane_f32(_r_66, _r_2_a_4c, vget_low_f32(_coeff1), 1);
4757 float32x4_t _tmp56b = vmlaq_lane_f32(_r_3_x_c, _r_11, vget_high_f32(_coeff0), 1);
4758 _tmp56b = vmlaq_lane_f32(_tmp56b, _r_55, vget_low_f32(_coeff0), 1);
4759
4760 float32x4_t _tmp5 = vaddq_f32(_tmp56a, _tmp56b);
4761 float32x4_t _tmp6 = vsubq_f32(_tmp56a, _tmp56b);
4762
4763 vst1q_f32(&tmp[5][m], _tmp5);
4764 vst1q_f32(&tmp[6][m], _tmp6);
4765
4766 r0 += w * 4;
4767 r1 += w * 4;
4768 r2 += w * 4;
4769 r3 += w * 4;
4770 }
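                    // second pass: transform the 8 columns of tmp and scatter the
                    // 64 results into the position-major rows of img0_tm; four
                    // columns are processed per iteration, one per vector lane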
4771
4772 const float* t0 = tmp[0];
4773 const float* t1 = tmp[1];
4774 const float* t2 = tmp[2];
4775 const float* t3 = tmp[3];
4776
4777 float* r0_tm0 = img0_tm.row(i * w_tm / 8 + j);
4778 float* r0_tm1 = img0_tm.row(i * w_tm / 8 + j + tiles * 8);
4779 float* r0_tm2 = img0_tm.row(i * w_tm / 8 + j + tiles * 16);
4780 float* r0_tm3 = img0_tm.row(i * w_tm / 8 + j + tiles * 24);
4781
4782 for (int m = 0; m + 3 < 8; m += 4)
4783 {
4784 float32x4_t _t0_0123 = vld1q_f32(t0);
4785 float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
4786 float32x4_t _t1_0123 = vld1q_f32(t1);
4787 float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
4788 float32x4_t _t2_0123 = vld1q_f32(t2);
4789 float32x4_t _t2_4567 = vld1q_f32(t2 + 4);
4790 float32x4_t _t3_0123 = vld1q_f32(t3);
4791 float32x4_t _t3_4567 = vld1q_f32(t3 + 4);
4792
4793 float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
4794 float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
4795 float32x4x2_t _t23_00221133 = vtrnq_f32(_t2_0123, _t3_0123);
4796 float32x4x2_t _t23_44665577 = vtrnq_f32(_t2_4567, _t3_4567);
4797
4798 // no vswp intrinsic :(
4799 float32x4_t _t_00 = vcombine_f32(vget_low_f32(_t01_00221133.val[0]), vget_low_f32(_t23_00221133.val[0]));
4800 float32x4_t _t_11 = vcombine_f32(vget_low_f32(_t01_00221133.val[1]), vget_low_f32(_t23_00221133.val[1]));
4801 float32x4_t _t_22 = vcombine_f32(vget_high_f32(_t01_00221133.val[0]), vget_high_f32(_t23_00221133.val[0]));
4802 float32x4_t _t_33 = vcombine_f32(vget_high_f32(_t01_00221133.val[1]), vget_high_f32(_t23_00221133.val[1]));
4803 float32x4_t _t_44 = vcombine_f32(vget_low_f32(_t01_44665577.val[0]), vget_low_f32(_t23_44665577.val[0]));
4804 float32x4_t _t_55 = vcombine_f32(vget_low_f32(_t01_44665577.val[1]), vget_low_f32(_t23_44665577.val[1]));
4805 float32x4_t _t_66 = vcombine_f32(vget_high_f32(_t01_44665577.val[0]), vget_high_f32(_t23_44665577.val[0]));
4806 float32x4_t _t_77 = vcombine_f32(vget_high_f32(_t01_44665577.val[1]), vget_high_f32(_t23_44665577.val[1]));
4807
4808 float32x4_t _t_0_m_6 = vsubq_f32(_t_00, _t_66);
4809 float32x4_t _t_7_m_1 = vsubq_f32(_t_77, _t_11);
4810
4811 float32x4_t _t_4_m_2 = vsubq_f32(_t_44, _t_22);
4812 float32x4_t _t_3_m_5 = vsubq_f32(_t_33, _t_55);
4813
4814 float32x4_t _r0_tm_0_0 = vmlaq_lane_f32(_t_0_m_6, _t_4_m_2, vget_high_f32(_coeff1), 1);
4815 float32x4_t _r0_tm_4_3 = vmlaq_lane_f32(_t_7_m_1, _t_3_m_5, vget_high_f32(_coeff1), 1);
4816
4817 r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_0, 0);
4818 r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_0, 1);
4819 r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_0, 2);
4820 r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_0, 3);
4821
4822 r0_tm0 += img0_tm.w * tiles;
4823 r0_tm1 += img0_tm.w * tiles;
4824 r0_tm2 += img0_tm.w * tiles;
4825 r0_tm3 += img0_tm.w * tiles;
4826
4827 float32x4_t _t_2_m_6 = vaddq_f32(_t_22, _t_66);
4828 float32x4_t _t_1_m_5 = vaddq_f32(_t_11, _t_55);
4829
4830 float32x4_t _tmp12a = vmlsq_lane_f32(_t_2_m_6, _t_44, vget_high_f32(_coeff1), 0);
4831 float32x4_t _tmp12b = vmlsq_lane_f32(_t_1_m_5, _t_33, vget_high_f32(_coeff1), 0);
4832
4833 float32x4_t _r0_tm_0_1 = vaddq_f32(_tmp12a, _tmp12b);
4834 float32x4_t _r0_tm_0_2 = vsubq_f32(_tmp12a, _tmp12b);
4835
4836 r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_1, 0);
4837 r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_1, 1);
4838 r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_1, 2);
4839 r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_1, 3);
4840
4841 r0_tm0 += img0_tm.w * tiles;
4842 r0_tm1 += img0_tm.w * tiles;
4843 r0_tm2 += img0_tm.w * tiles;
4844 r0_tm3 += img0_tm.w * tiles;
4845
4846 r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_2, 0);
4847 r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_2, 1);
4848 r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_2, 2);
4849 r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_2, 3);
4850
4851 r0_tm0 += img0_tm.w * tiles;
4852 r0_tm1 += img0_tm.w * tiles;
4853 r0_tm2 += img0_tm.w * tiles;
4854 r0_tm3 += img0_tm.w * tiles;
4855
4856 float32x4_t _t_4_x_c = vmulq_lane_f32(_t_44, vget_high_f32(_coeff0), 0);
4857 float32x4_t _t_3_x_c = vmulq_lane_f32(_t_33, vget_low_f32(_coeff1), 0);
4858
4859 float32x4_t _tmp34a = vaddq_f32(_t_66, _t_4_x_c);
4860 _tmp34a = vmlaq_lane_f32(_tmp34a, _t_22, vget_low_f32(_coeff0), 0);
4861
4862 float32x4_t _tmp34b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_low_f32(_coeff0), 1);
4863 _tmp34b = vmlaq_lane_f32(_tmp34b, _t_55, vget_high_f32(_coeff0), 1);
4864
4865 float32x4_t _r0_tm_0_3 = vaddq_f32(_tmp34a, _tmp34b);
4866 float32x4_t _r0_tm_4_0 = vsubq_f32(_tmp34a, _tmp34b);
4867
4868 r0_tm0[0] = vgetq_lane_f32(_r0_tm_0_3, 0);
4869 r0_tm1[0] = vgetq_lane_f32(_r0_tm_0_3, 1);
4870 r0_tm2[0] = vgetq_lane_f32(_r0_tm_0_3, 2);
4871 r0_tm3[0] = vgetq_lane_f32(_r0_tm_0_3, 3);
4872
4873 r0_tm0 += img0_tm.w * tiles;
4874 r0_tm1 += img0_tm.w * tiles;
4875 r0_tm2 += img0_tm.w * tiles;
4876 r0_tm3 += img0_tm.w * tiles;
4877
4878 r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_0, 0);
4879 r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_0, 1);
4880 r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_0, 2);
4881 r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_0, 3);
4882
4883 r0_tm0 += img0_tm.w * tiles;
4884 r0_tm1 += img0_tm.w * tiles;
4885 r0_tm2 += img0_tm.w * tiles;
4886 r0_tm3 += img0_tm.w * tiles;
4887
4888 float32x4_t _t_2_a_4c = vaddq_f32(_t_22, _t_4_x_c);
4889 float32x4_t _tmp56a = vmlaq_lane_f32(_t_66, _t_2_a_4c, vget_low_f32(_coeff1), 1);
4890 float32x4_t _tmp56b = vmlaq_lane_f32(_t_3_x_c, _t_11, vget_high_f32(_coeff0), 1);
4891 _tmp56b = vmlaq_lane_f32(_tmp56b, _t_55, vget_low_f32(_coeff0), 1);
4892
4893 float32x4_t _r0_tm_4_1 = vaddq_f32(_tmp56a, _tmp56b);
4894 float32x4_t _r0_tm_4_2 = vsubq_f32(_tmp56a, _tmp56b);
4895
4896 r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_1, 0);
4897 r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_1, 1);
4898 r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_1, 2);
4899 r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_1, 3);
4900
4901 r0_tm0 += img0_tm.w * tiles;
4902 r0_tm1 += img0_tm.w * tiles;
4903 r0_tm2 += img0_tm.w * tiles;
4904 r0_tm3 += img0_tm.w * tiles;
4905
4906 r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_2, 0);
4907 r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_2, 1);
4908 r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_2, 2);
4909 r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_2, 3);
4910
4911 r0_tm0 += img0_tm.w * tiles;
4912 r0_tm1 += img0_tm.w * tiles;
4913 r0_tm2 += img0_tm.w * tiles;
4914 r0_tm3 += img0_tm.w * tiles;
4915
4916 r0_tm0[0] = vgetq_lane_f32(_r0_tm_4_3, 0);
4917 r0_tm1[0] = vgetq_lane_f32(_r0_tm_4_3, 1);
4918 r0_tm2[0] = vgetq_lane_f32(_r0_tm_4_3, 2);
4919 r0_tm3[0] = vgetq_lane_f32(_r0_tm_4_3, 3);
4920
4921 t0 += 8 * 4;
4922 t1 += 8 * 4;
4923 t2 += 8 * 4;
4924 t3 += 8 * 4;
4925
4926 r0_tm0 += img0_tm.w * tiles * 25;
4927 r0_tm1 += img0_tm.w * tiles * 25;
4928 r0_tm2 += img0_tm.w * tiles * 25;
4929 r0_tm3 += img0_tm.w * tiles * 25;
4930 }
#else // __aarch64__ || !defined(NDEBUG)
4932 float* t0 = tmp[0];
4933 float* t1 = tmp[1];
4934 float* t2 = tmp[2];
4935 float* t3 = tmp[3];
4936 float* t4 = tmp[4];
4937 float* t5 = tmp[5];
4938 float* t6 = tmp[6];
4939 float* t7 = tmp[7];
4940
4941 int stepw = w * 4 * 4;
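                    // stepw = 4 input rows in bytes; loop0 consumes rows 0-3 and the
                    // post-incremented pointers then sit on rows 4-7 for loop1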
4942
4943 asm volatile(
4944
4945 // loop0
4946 "vld1.f32 {d16-d19}, [%8], %26 \n"
4947 "vld1.f32 {d20-d23}, [%9], %26 \n"
4948 "vld1.f32 {d24-d27}, [%10], %26 \n"
4949
4950 "vtrn.32 q8, q10 \n"
4951
4952 "vld1.f32 {d28-d31}, [%11], %26 \n"
4953
4954 "vtrn.32 q9, q11 \n"
4955 "vtrn.32 q12, q14 \n"
4956 "vtrn.32 q13, q15 \n"
4957
4958 "vswp d17, d24 \n"
4959 "vswp d19, d26 \n"
4960 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
4961 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
4962
4963 "vsub.f32 q2, q8, q13 \n"
4964 "vsub.f32 q3, q9, q12 \n"
4965
4966 "vadd.f32 q4, q12, q13 \n"
4967 "vadd.f32 q5, q10, q11 \n"
4968
4969 "vmla.f32 q2, q3, %f25[1] \n"
4970
4971 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
4972 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
4973
4974 "vmls.f32 q4, q9, %f25[0] \n"
4975 "vmls.f32 q5, q14, %f25[0] \n"
4976
4977 "vst1.f32 {d4-d5}, [%0]! \n" // tmp[0][m]
4978
4979 "vmov q3, q7 \n" // use q7
4980
4981 "vadd.f32 q2, q13, q6 \n" // use q6
4982 "vmla.f32 q3, q10, %e24[1] \n"
4983
4984 "vadd.f32 q8, q4, q5 \n"
4985 "vsub.f32 q9, q4, q5 \n"
4986
4987 "vmov q5, q7 \n" // use q7
4988
4989 "vadd.f32 q6, q12, q6 \n" // use q6
4990 "vmla.f32 q5, q10, %f24[1] \n"
4991
4992 "vmov q4, q13 \n"
4993
4994 "vmla.f32 q2, q12, %e24[0] \n"
4995 "vmla.f32 q3, q11, %f24[1] \n"
4996
4997 "vst1.f32 {d16-d17}, [%1]! \n" // tmp[1][m]
4998
4999 "vmla.f32 q4, q6, %e25[1] \n"
5000 "vmla.f32 q5, q11, %e24[1] \n"
5001
5002 "vst1.f32 {d18-d19}, [%2]! \n" // tmp[2][m]
5003
5004 "vadd.f32 q8, q2, q3 \n"
5005 "vsub.f32 q9, q2, q3 \n"
5006
5007 "vsub.f32 q6, q15, q10 \n"
5008 "vsub.f32 q7, q14, q11 \n"
5009
5010 "vadd.f32 q2, q4, q5 \n"
5011 "vsub.f32 q3, q4, q5 \n"
5012
5013 "vst1.f32 {d16-d17}, [%3]! \n" // tmp[3][m]
5014 "vst1.f32 {d18-d19}, [%4]! \n" // tmp[4][m]
5015
5016 "vmla.f32 q6, q7, %f25[1] \n"
5017
5018 "vst1.f32 {d4-d5}, [%5]! \n" // tmp[5][m]
5019 "vst1.f32 {d6-d7}, [%6]! \n" // tmp[6][m]
5020
5021 "vst1.f32 {d12-d13}, [%7]! \n" // tmp[7][m]
5022
5023 // loop1
5024 "vld1.f32 {d16-d19}, [%8] \n"
5025 "vld1.f32 {d20-d23}, [%9] \n"
5026 "vld1.f32 {d24-d27}, [%10] \n"
5027
5028 "vtrn.32 q8, q10 \n"
5029
5030 "vld1.f32 {d28-d31}, [%11] \n"
5031
5032 "vtrn.32 q9, q11 \n"
5033 "vtrn.32 q12, q14 \n"
5034 "vtrn.32 q13, q15 \n"
5035
5036 "vswp d17, d24 \n"
5037 "vswp d19, d26 \n"
5038 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
5039 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
5040
5041 "vsub.f32 q2, q8, q13 \n"
5042 "vsub.f32 q3, q9, q12 \n"
5043
5044 "vadd.f32 q4, q12, q13 \n"
5045 "vadd.f32 q5, q10, q11 \n"
5046
5047 "vmla.f32 q2, q3, %f25[1] \n"
5048
5049 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
5050 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
5051
5052 "vmls.f32 q4, q9, %f25[0] \n"
5053 "vmls.f32 q5, q14, %f25[0] \n"
5054
5055 "vst1.f32 {d4-d5}, [%0]! \n" // tmp[0][m]
5056
5057 "vmov q3, q7 \n" // use q7
5058
5059 "vadd.f32 q2, q13, q6 \n" // use q6
5060 "vmla.f32 q3, q10, %e24[1] \n"
5061
5062 "vadd.f32 q8, q4, q5 \n"
5063 "vsub.f32 q9, q4, q5 \n"
5064
5065 "vmov q5, q7 \n" // use q7
5066
5067 "vadd.f32 q6, q12, q6 \n" // use q6
5068 "vmla.f32 q5, q10, %f24[1] \n"
5069
5070 "vmov q4, q13 \n"
5071
5072 "vmla.f32 q2, q12, %e24[0] \n"
5073 "vmla.f32 q3, q11, %f24[1] \n"
5074
5075 "vst1.f32 {d16-d17}, [%1]! \n" // tmp[1][m]
5076
5077 "vmla.f32 q4, q6, %e25[1] \n"
5078 "vmla.f32 q5, q11, %e24[1] \n"
5079
5080 "vst1.f32 {d18-d19}, [%2]! \n" // tmp[2][m]
5081
5082 "vadd.f32 q8, q2, q3 \n"
5083 "vsub.f32 q9, q2, q3 \n"
5084
5085 "vsub.f32 q6, q15, q10 \n"
5086 "vsub.f32 q7, q14, q11 \n"
5087
5088 "vadd.f32 q2, q4, q5 \n"
5089 "vsub.f32 q3, q4, q5 \n"
5090
5091 "vst1.f32 {d16-d17}, [%3]! \n" // tmp[3][m]
5092 "vst1.f32 {d18-d19}, [%4]! \n" // tmp[4][m]
5093
5094 "vmla.f32 q6, q7, %f25[1] \n"
5095
5096 "vst1.f32 {d4-d5}, [%5]! \n" // tmp[5][m]
5097 "vst1.f32 {d6-d7}, [%6]! \n" // tmp[6][m]
5098
5099 "vst1.f32 {d12-d13}, [%7]! \n" // tmp[7][m]
5100
5101 : "=r"(t0), // %0
5102 "=r"(t1), // %1
5103 "=r"(t2), // %2
5104 "=r"(t3), // %3
5105 "=r"(t4), // %4
5106 "=r"(t5), // %5
5107 "=r"(t6), // %6
5108 "=r"(t7), // %7
5109 "=r"(r0), // %8
5110 "=r"(r1), // %9
5111 "=r"(r2), // %10
5112 "=r"(r3) // %11
5113 : "0"(t0),
5114 "1"(t1),
5115 "2"(t2),
5116 "3"(t3),
5117 "4"(t4),
5118 "5"(t5),
5119 "6"(t6),
5120 "7"(t7),
5121 "8"(r0),
5122 "9"(r1),
5123 "10"(r2),
5124 "11"(r3),
5125 "w"(_coeff0), // %24
5126 "w"(_coeff1), // %25
5127 "r"(stepw) // %26
5128 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
5129
5130 t0 = tmp[0];
5131 t1 = tmp[1];
5132 t2 = tmp[2];
5133 t3 = tmp[3];
5134
5135 float* r0_tm0_0 = img0_tm.row(i * w_tm / 8 + j);
5136 float* r0_tm1_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 8);
5137 float* r0_tm2_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 16);
5138 float* r0_tm3_0 = img0_tm.row(i * w_tm / 8 + j + tiles * 24);
5139 float* r0_tm0_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 32);
5140 float* r0_tm1_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 40);
5141 float* r0_tm2_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 48);
5142 float* r0_tm3_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 56);
5143
5144 int step = img0_tm.w * tiles * 4;
5145
5146 asm volatile(
5147
5148 // loop0
5149 "vld1.f32 {d16-d19}, [%8] \n"
5150 "add %8, %8, #128 \n"
5151 "vld1.f32 {d20-d23}, [%9] \n"
5152 "add %9, %9, #128 \n"
5153 "vld1.f32 {d24-d27}, [%10] \n"
5154 "add %10, %10, #128 \n"
5155
5156 "vtrn.32 q8, q10 \n"
5157
5158 "vld1.f32 {d28-d31}, [%11] \n"
5159 "add %11, %11, #128 \n"
5160
5161 "vtrn.32 q9, q11 \n"
5162 "vtrn.32 q12, q14 \n"
5163 "vtrn.32 q13, q15 \n"
5164
5165 "vswp d17, d24 \n"
5166 "vswp d19, d26 \n"
5167 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
5168 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
5169
5170 "vsub.f32 q2, q8, q13 \n"
5171 "vsub.f32 q3, q9, q12 \n"
5172
5173 "vadd.f32 q4, q12, q13 \n"
5174 "vadd.f32 q5, q10, q11 \n"
5175
5176 "vmla.f32 q2, q3, %f25[1] \n"
5177
5178 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
5179 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
5180
5181 "vmls.f32 q4, q9, %f25[0] \n"
5182 "vmls.f32 q5, q14, %f25[0] \n"
5183
5184 "vst1.f32 {d4[0]}, [%0], %26 \n"
5185 "vst1.f32 {d4[1]}, [%1], %26 \n"
5186
5187 "vmov q3, q7 \n" // use q7
5188
5189 "vst1.f32 {d5[0]}, [%2], %26 \n"
5190 "vst1.f32 {d5[1]}, [%3], %26 \n"
5191
5192 "vadd.f32 q2, q13, q6 \n" // use q6
5193 "vmla.f32 q3, q10, %e24[1] \n"
5194
5195 "vadd.f32 q8, q4, q5 \n"
5196 "vsub.f32 q9, q4, q5 \n"
5197
5198 "vmov q5, q7 \n" // use q7
5199
5200 "vadd.f32 q6, q12, q6 \n" // use q6
5201 "vmla.f32 q5, q10, %f24[1] \n"
5202
5203 "vmov q4, q13 \n"
5204
5205 "vmla.f32 q2, q12, %e24[0] \n"
5206 "vmla.f32 q3, q11, %f24[1] \n"
5207
5208 "vst1.f32 {d16[0]}, [%0], %26 \n"
5209 "vst1.f32 {d16[1]}, [%1], %26 \n"
5210
5211 "vmla.f32 q4, q6, %e25[1] \n"
5212
5213 "vst1.f32 {d17[0]}, [%2], %26 \n"
5214 "vst1.f32 {d17[1]}, [%3], %26 \n"
5215
5216 "vmla.f32 q5, q11, %e24[1] \n"
5217
5218 "vst1.f32 {d18[0]}, [%0], %26 \n"
5219 "vst1.f32 {d18[1]}, [%1], %26 \n"
5220
5221 "vadd.f32 q8, q2, q3 \n"
5222
5223 "vst1.f32 {d19[0]}, [%2], %26 \n"
5224 "vst1.f32 {d19[1]}, [%3], %26 \n"
5225
5226 "vsub.f32 q9, q2, q3 \n"
5227
5228 "vsub.f32 q6, q15, q10 \n"
5229 "vsub.f32 q7, q14, q11 \n"
5230
5231 "vst1.f32 {d16[0]}, [%0], %26 \n"
5232 "vst1.f32 {d16[1]}, [%1], %26 \n"
5233 "vst1.f32 {d17[0]}, [%2], %26 \n"
5234 "vst1.f32 {d17[1]}, [%3], %26 \n"
5235
5236 "vadd.f32 q2, q4, q5 \n"
5237
5238 "vst1.f32 {d18[0]}, [%0], %26 \n"
5239 "vst1.f32 {d18[1]}, [%1], %26 \n"
5240 "vst1.f32 {d19[0]}, [%2], %26 \n"
5241 "vst1.f32 {d19[1]}, [%3], %26 \n"
5242
5243 "vsub.f32 q3, q4, q5 \n"
5244
5245 "vst1.f32 {d4[0]}, [%0], %26 \n"
5246 "vst1.f32 {d4[1]}, [%1], %26 \n"
5247 "vst1.f32 {d5[0]}, [%2], %26 \n"
5248 "vst1.f32 {d5[1]}, [%3], %26 \n"
5249
5250 "vmla.f32 q6, q7, %f25[1] \n"
5251
5252 "vst1.f32 {d6[0]}, [%0], %26 \n"
5253 "vst1.f32 {d6[1]}, [%1], %26 \n"
5254 "vst1.f32 {d7[0]}, [%2], %26 \n"
5255 "vst1.f32 {d7[1]}, [%3], %26 \n"
5256
5257 "vst1.f32 {d12[0]}, [%0] \n"
5258 "vst1.f32 {d12[1]}, [%1] \n"
5259 "vst1.f32 {d13[0]}, [%2] \n"
5260 "vst1.f32 {d13[1]}, [%3] \n"
5261
5262 // loop1
5263 "vld1.f32 {d16-d19}, [%8] \n"
5264 "vld1.f32 {d20-d23}, [%9] \n"
5265 "vld1.f32 {d24-d27}, [%10] \n"
5266
5267 "vtrn.32 q8, q10 \n"
5268
5269 "vld1.f32 {d28-d31}, [%11] \n"
5270
5271 "vtrn.32 q9, q11 \n"
5272 "vtrn.32 q12, q14 \n"
5273 "vtrn.32 q13, q15 \n"
5274
5275 "vswp d17, d24 \n"
5276 "vswp d19, d26 \n"
5277 "vswp d21, d28 \n" // q8 = 00 q9 = 44 q10 = 11 q11 = 55
5278 "vswp d23, d30 \n" // q12 = 22 q13 = 66 q14 = 33 q15 = 77
5279
5280 "vsub.f32 q2, q8, q13 \n"
5281 "vsub.f32 q3, q9, q12 \n"
5282
5283 "vadd.f32 q4, q12, q13 \n"
5284 "vadd.f32 q5, q10, q11 \n"
5285
5286 "vmla.f32 q2, q3, %f25[1] \n"
5287
5288 "vmul.f32 q7, q14, %e25[0] \n" // q7 = _r_3_x_c
5289 "vmul.f32 q6, q9, %f24[0] \n" // q6 = _r_4_x_c
5290
5291 "vmls.f32 q4, q9, %f25[0] \n"
5292 "vmls.f32 q5, q14, %f25[0] \n"
5293
5294 "vst1.f32 {d4[0]}, [%4], %26 \n"
5295 "vst1.f32 {d4[1]}, [%5], %26 \n"
5296
5297 "vmov q3, q7 \n" // use q7
5298
5299 "vst1.f32 {d5[0]}, [%6], %26 \n"
5300 "vst1.f32 {d5[1]}, [%7], %26 \n"
5301
5302 "vadd.f32 q2, q13, q6 \n" // use q6
5303 "vmla.f32 q3, q10, %e24[1] \n"
5304
5305 "vadd.f32 q8, q4, q5 \n"
5306 "vsub.f32 q9, q4, q5 \n"
5307
5308 "vmov q5, q7 \n" // use q7
5309
5310 "vadd.f32 q6, q12, q6 \n" // use q6
5311 "vmla.f32 q5, q10, %f24[1] \n"
5312
5313 "vmov q4, q13 \n"
5314
5315 "vmla.f32 q2, q12, %e24[0] \n"
5316 "vmla.f32 q3, q11, %f24[1] \n"
5317
5318 "vst1.f32 {d16[0]}, [%4], %26 \n"
5319 "vst1.f32 {d16[1]}, [%5], %26 \n"
5320
5321 "vmla.f32 q4, q6, %e25[1] \n"
5322
5323 "vst1.f32 {d17[0]}, [%6], %26 \n"
5324 "vst1.f32 {d17[1]}, [%7], %26 \n"
5325
5326 "vmla.f32 q5, q11, %e24[1] \n"
5327
5328 "vst1.f32 {d18[0]}, [%4], %26 \n"
5329 "vst1.f32 {d18[1]}, [%5], %26 \n"
5330
5331 "vadd.f32 q8, q2, q3 \n"
5332
5333 "vst1.f32 {d19[0]}, [%6], %26 \n"
5334 "vst1.f32 {d19[1]}, [%7], %26 \n"
5335
5336 "vsub.f32 q9, q2, q3 \n"
5337
5338 "vsub.f32 q6, q15, q10 \n"
5339 "vsub.f32 q7, q14, q11 \n"
5340
5341 "vst1.f32 {d16[0]}, [%4], %26 \n"
5342 "vst1.f32 {d16[1]}, [%5], %26 \n"
5343 "vst1.f32 {d17[0]}, [%6], %26 \n"
5344 "vst1.f32 {d17[1]}, [%7], %26 \n"
5345
5346 "vadd.f32 q2, q4, q5 \n"
5347
5348 "vst1.f32 {d18[0]}, [%4], %26 \n"
5349 "vst1.f32 {d18[1]}, [%5], %26 \n"
5350 "vst1.f32 {d19[0]}, [%6], %26 \n"
5351 "vst1.f32 {d19[1]}, [%7], %26 \n"
5352
5353 "vsub.f32 q3, q4, q5 \n"
5354
5355 "vst1.f32 {d4[0]}, [%4], %26 \n"
5356 "vst1.f32 {d4[1]}, [%5], %26 \n"
5357 "vst1.f32 {d5[0]}, [%6], %26 \n"
5358 "vst1.f32 {d5[1]}, [%7], %26 \n"
5359
5360 "vmla.f32 q6, q7, %f25[1] \n"
5361
5362 "vst1.f32 {d6[0]}, [%4], %26 \n"
5363 "vst1.f32 {d6[1]}, [%5], %26 \n"
5364 "vst1.f32 {d7[0]}, [%6], %26 \n"
5365 "vst1.f32 {d7[1]}, [%7], %26 \n"
5366
5367 "vst1.f32 {d12[0]}, [%4] \n"
5368 "vst1.f32 {d12[1]}, [%5] \n"
5369 "vst1.f32 {d13[0]}, [%6] \n"
5370 "vst1.f32 {d13[1]}, [%7] \n"
5371
5372 : "=r"(r0_tm0_0), // %0
5373 "=r"(r0_tm1_0), // %1
5374 "=r"(r0_tm2_0), // %2
5375 "=r"(r0_tm3_0), // %3
5376 "=r"(r0_tm0_4), // %4
5377 "=r"(r0_tm1_4), // %5
5378 "=r"(r0_tm2_4), // %6
5379 "=r"(r0_tm3_4), // %7
5380 "=r"(t0), // %8
5381 "=r"(t1), // %9
5382 "=r"(t2), // %10
5383 "=r"(t3) // %11
5384 : "0"(r0_tm0_0),
5385 "1"(r0_tm1_0),
5386 "2"(r0_tm2_0),
5387 "3"(r0_tm3_0),
5388 "4"(r0_tm0_4),
5389 "5"(r0_tm1_4),
5390 "6"(r0_tm2_4),
5391 "7"(r0_tm3_4),
5392 "8"(t0),
5393 "9"(t1),
5394 "10"(t2),
5395 "11"(t3),
5396 "w"(_coeff0), // %24
5397 "w"(_coeff1), // %25
5398 "r"(step) // %26
5399 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__ || !defined(NDEBUG)
5401 #else
5402 const float* r0 = img0.row(i * 6) + j * 6;
5403
5404 for (int m = 0; m < 8; m++)
5405 {
5406 tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25f;
5407 tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25f;
5408
5409 float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25f);
5410 float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25f);
5411
5412 tmp[1][m] = tmp12a + tmp12b;
5413 tmp[2][m] = tmp12a - tmp12b;
5414
5415 float tmp34a = (r0[6] + r0[2] * 0.25f - r0[4] * 1.25f);
5416 float tmp34b = (r0[1] * 0.5f - r0[3] * 2.5f + r0[5] * 2.f);
5417
5418 tmp[3][m] = tmp34a + tmp34b;
5419 tmp[4][m] = tmp34a - tmp34b;
5420
5421 float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25f) * 4.f);
5422 float tmp56b = (r0[1] * 2.f - r0[3] * 2.5f + r0[5] * 0.5f);
5423
5424 tmp[5][m] = tmp56a + tmp56b;
5425 tmp[6][m] = tmp56a - tmp56b;
5426
5427 r0 += w;
5428 }
5429
5430 float* r0_tm_0 = img0_tm.row(i * w_tm / 8 + j);
5431 float* r0_tm_1 = img0_tm.row(i * w_tm / 8 + j + tiles);
5432 float* r0_tm_2 = img0_tm.row(i * w_tm / 8 + j + tiles * 2);
5433 float* r0_tm_3 = img0_tm.row(i * w_tm / 8 + j + tiles * 3);
5434 float* r0_tm_4 = img0_tm.row(i * w_tm / 8 + j + tiles * 4);
5435 float* r0_tm_5 = img0_tm.row(i * w_tm / 8 + j + tiles * 5);
5436 float* r0_tm_6 = img0_tm.row(i * w_tm / 8 + j + tiles * 6);
5437 float* r0_tm_7 = img0_tm.row(i * w_tm / 8 + j + tiles * 7);
5438
5439 for (int m = 0; m < 8; m++)
5440 {
5441 const float* tmp0 = tmp[m];
5442
5443 r0_tm_0[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25f;
5444 r0_tm_7[0] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25f;
5445
5446 float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25f);
5447 float tmp12b = (tmp0[1] - tmp0[3] * 4.25f + tmp0[5]);
5448
5449 r0_tm_1[0] = tmp12a + tmp12b;
5450 r0_tm_2[0] = tmp12a - tmp12b;
5451
5452 float tmp34a = (tmp0[6] + tmp0[2] * 0.25f - tmp0[4] * 1.25f);
5453 float tmp34b = (tmp0[1] * 0.5f - tmp0[3] * 2.5f + tmp0[5] * 2.f);
5454
5455 r0_tm_3[0] = tmp34a + tmp34b;
5456 r0_tm_4[0] = tmp34a - tmp34b;
5457
5458 float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25f) * 4.f);
5459 float tmp56b = (tmp0[1] * 2.f - tmp0[3] * 2.5f + tmp0[5] * 0.5f);
5460
5461 r0_tm_5[0] = tmp56a + tmp56b;
5462 r0_tm_6[0] = tmp56a - tmp56b;
5463
5464 r0_tm_0 += img0_tm.w * tiles * 8;
5465 r0_tm_1 += img0_tm.w * tiles * 8;
5466 r0_tm_2 += img0_tm.w * tiles * 8;
5467 r0_tm_3 += img0_tm.w * tiles * 8;
5468 r0_tm_4 += img0_tm.w * tiles * 8;
5469 r0_tm_5 += img0_tm.w * tiles * 8;
5470 r0_tm_6 += img0_tm.w * tiles * 8;
5471 r0_tm_7 += img0_tm.w * tiles * 8;
5472 }
5473 #endif // __ARM_NEON
5474 }
5475 }
5476 }
5477 }
5478 bottom_blob_bordered = Mat();
5479 // END transform input
5480
5481 // BEGIN dot
5482 Mat top_blob_tm;
5483 {
5484 int w_tm = outw / 6 * 8;
5485 int h_tm = outh / 6 * 8;
5486 const int tiles = w_tm / 8 * h_tm / 8;
5487
5488 // permute
5489 // bottom_blob_tm.create(1, 64 * tiles, inch);
5490 // Mat bottom_blob_tm2(inch, tiles, 64);
5491 Mat bottom_blob_tm2(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u, opt.workspace_allocator);
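        // repack the transformed input per position r: tiles are grouped in
        // blocks of 8, then 4, then singles, and within a block the inch values
        // are interleaved so the GEMM inner loops below can stream sequentially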
5492
5493 #pragma omp parallel for num_threads(opt.num_threads)
5494 for (int r = 0; r < 64; r++)
5495 {
5496 Mat tm2 = bottom_blob_tm2.channel(r);
5497
5498 // tile
5499 int i = 0;
5500 for (; i + 7 < tiles; i += 8)
5501 {
5502 float* tm2p = tm2.row(i / 8);
5503
5504 const float* r0 = bottom_blob_tm;
5505
5506 r0 += r * tiles + i;
5507
5508 for (int q = 0; q < inch; q++)
5509 {
5510 #if __ARM_NEON
5511 float32x4_t _r0 = vld1q_f32(r0);
5512 float32x4_t _r0n = vld1q_f32(r0 + 4);
5513 vst1q_f32(tm2p, _r0);
5514 vst1q_f32(tm2p + 4, _r0n);
5515 #else
5516 tm2p[0] = r0[0];
5517 tm2p[1] = r0[1];
5518 tm2p[2] = r0[2];
5519 tm2p[3] = r0[3];
5520 tm2p[4] = r0[4];
5521 tm2p[5] = r0[5];
5522 tm2p[6] = r0[6];
5523 tm2p[7] = r0[7];
5524 #endif // __ARM_NEON
5525
5526 r0 += bottom_blob_tm.cstep;
5527 tm2p += 8;
5528 }
5529 }
5530 for (; i + 3 < tiles; i += 4)
5531 {
5532 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
5533
5534 const float* r0 = bottom_blob_tm;
5535
5536 r0 += r * tiles + i;
5537
5538 for (int q = 0; q < inch; q++)
5539 {
5540 #if __ARM_NEON
5541 float32x4_t _r0 = vld1q_f32(r0);
5542 vst1q_f32(tm2p, _r0);
5543 #else
5544 tm2p[0] = r0[0];
5545 tm2p[1] = r0[1];
5546 tm2p[2] = r0[2];
5547 tm2p[3] = r0[3];
5548 #endif // __ARM_NEON
5549
5550 r0 += bottom_blob_tm.cstep;
5551 tm2p += 4;
5552 }
5553 }
5554 for (; i < tiles; i++)
5555 {
5556 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + i % 4);
5557
5558 const float* r0 = bottom_blob_tm;
5559
5560 r0 += r * tiles + i;
5561
5562 for (int q = 0; q < inch; q++)
5563 {
5564 tm2p[0] = r0[0];
5565
5566 r0 += bottom_blob_tm.cstep;
5567 tm2p += 1;
5568 }
5569 }
5570 }
5571
5572 bottom_blob_tm = Mat();
5573 // permute end
5574
        top_blob_tm.create(1, 64 * tiles, outch, 4u, opt.workspace_allocator);
5576
5577 int nn_outch = 0;
5578 int remain_outch_start = 0;
5579
5580 #if __ARM_NEON && __aarch64__
5581 nn_outch = outch >> 3;
5582 remain_outch_start = nn_outch << 3;
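        // aarch64: produce 8 output channels per iteration from the 8-wide
        // packed kernel_tm channel; leftovers fall through to the 4-wide and
        // single-channel loops after this block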
5583
5584 #pragma omp parallel for num_threads(opt.num_threads)
5585 for (int pp = 0; pp < nn_outch; pp++)
5586 {
5587 int p = pp * 8;
5588
5589 const Mat kernel_tm0 = kernel_tm.channel(p / 8);
5590
5591 Mat out0_tm = top_blob_tm.channel(p);
5592 Mat out1_tm = top_blob_tm.channel(p + 1);
5593 Mat out2_tm = top_blob_tm.channel(p + 2);
5594 Mat out3_tm = top_blob_tm.channel(p + 3);
5595 Mat out4_tm = top_blob_tm.channel(p + 4);
5596 Mat out5_tm = top_blob_tm.channel(p + 5);
5597 Mat out6_tm = top_blob_tm.channel(p + 6);
5598 Mat out7_tm = top_blob_tm.channel(p + 7);
5599
5600 float* output0_tm = out0_tm;
5601 float* output1_tm = out1_tm;
5602 float* output2_tm = out2_tm;
5603 float* output3_tm = out3_tm;
5604 float* output4_tm = out4_tm;
5605 float* output5_tm = out5_tm;
5606 float* output6_tm = out6_tm;
5607 float* output7_tm = out7_tm;
5608
5609 for (int r = 0; r < 64; r++)
5610 {
5611 const Mat bb2 = bottom_blob_tm2.channel(r);
5612
5613 // tile
5614 int i = 0;
5615 for (; i + 7 < tiles; i += 8)
5616 {
5617 const float* bb2p0 = bb2.row(i / 8);
5618
5619 const float* ktm0 = kernel_tm0.row(r);
5620
5621 asm volatile(
5622 "eor v16.16b, v16.16b, v16.16b \n"
5623 "eor v17.16b, v17.16b, v17.16b \n"
5624 "eor v18.16b, v18.16b, v18.16b \n"
5625 "eor v19.16b, v19.16b, v19.16b \n"
5626 "eor v20.16b, v20.16b, v20.16b \n"
5627 "eor v21.16b, v21.16b, v21.16b \n"
5628 "eor v22.16b, v22.16b, v22.16b \n"
5629 "eor v23.16b, v23.16b, v23.16b \n"
5630 "eor v24.16b, v24.16b, v24.16b \n"
5631 "eor v25.16b, v25.16b, v25.16b \n"
5632 "eor v26.16b, v26.16b, v26.16b \n"
5633 "eor v27.16b, v27.16b, v27.16b \n"
5634 "eor v28.16b, v28.16b, v28.16b \n"
5635 "eor v29.16b, v29.16b, v29.16b \n"
5636 "eor v30.16b, v30.16b, v30.16b \n"
5637 "eor v31.16b, v31.16b, v31.16b \n"
5638
5639 // inch loop
5640 "lsr w4, %w20, #2 \n" // w4 = nn = inch >> 2
5641 "cmp w4, #0 \n"
5642 "beq 1f \n"
5643
5644 "0: \n"
5645
5646 "prfm pldl1keep, [%8, #512] \n"
5647 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64 \n"
5648
5649 "prfm pldl1keep, [%9, #512] \n"
5650 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
5651
5652 "fmla v16.4s, v8.4s, v0.s[0] \n"
5653 "fmla v17.4s, v9.4s, v0.s[0] \n"
5654 "fmla v18.4s, v8.4s, v0.s[1] \n"
5655 "fmla v19.4s, v9.4s, v0.s[1] \n"
5656 "fmla v20.4s, v8.4s, v0.s[2] \n"
5657 "fmla v21.4s, v9.4s, v0.s[2] \n"
5658 "fmla v22.4s, v8.4s, v0.s[3] \n"
5659 "fmla v23.4s, v9.4s, v0.s[3] \n"
5660
5661 "prfm pldl1keep, [%9, #512] \n"
5662 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n"
5663
5664 "fmla v24.4s, v8.4s, v1.s[0] \n"
5665 "fmla v25.4s, v9.4s, v1.s[0] \n"
5666 "fmla v26.4s, v8.4s, v1.s[1] \n"
5667 "fmla v27.4s, v9.4s, v1.s[1] \n"
5668 "fmla v28.4s, v8.4s, v1.s[2] \n"
5669 "fmla v29.4s, v9.4s, v1.s[2] \n"
5670 "fmla v30.4s, v8.4s, v1.s[3] \n"
5671 "fmla v31.4s, v9.4s, v1.s[3] \n"
5672
5673 "fmla v16.4s, v10.4s, v2.s[0] \n"
5674 "fmla v17.4s, v11.4s, v2.s[0] \n"
5675 "fmla v18.4s, v10.4s, v2.s[1] \n"
5676 "fmla v19.4s, v11.4s, v2.s[1] \n"
5677 "fmla v20.4s, v10.4s, v2.s[2] \n"
5678 "fmla v21.4s, v11.4s, v2.s[2] \n"
5679 "fmla v22.4s, v10.4s, v2.s[3] \n"
5680 "fmla v23.4s, v11.4s, v2.s[3] \n"
5681
5682 "prfm pldl1keep, [%8, #512] \n"
5683 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%8], #64 \n"
5684
5685 "fmla v24.4s, v10.4s, v3.s[0] \n"
5686 "fmla v25.4s, v11.4s, v3.s[0] \n"
5687 "fmla v26.4s, v10.4s, v3.s[1] \n"
5688 "fmla v27.4s, v11.4s, v3.s[1] \n"
5689 "fmla v28.4s, v10.4s, v3.s[2] \n"
5690 "fmla v29.4s, v11.4s, v3.s[2] \n"
5691 "fmla v30.4s, v10.4s, v3.s[3] \n"
5692 "fmla v31.4s, v11.4s, v3.s[3] \n"
5693
5694 "fmla v16.4s, v12.4s, v4.s[0] \n"
5695 "fmla v17.4s, v13.4s, v4.s[0] \n"
5696 "fmla v18.4s, v12.4s, v4.s[1] \n"
5697 "fmla v19.4s, v13.4s, v4.s[1] \n"
5698 "fmla v20.4s, v12.4s, v4.s[2] \n"
5699 "fmla v21.4s, v13.4s, v4.s[2] \n"
5700 "fmla v22.4s, v12.4s, v4.s[3] \n"
5701 "fmla v23.4s, v13.4s, v4.s[3] \n"
5702
5703 "fmla v24.4s, v12.4s, v5.s[0] \n"
5704 "fmla v25.4s, v13.4s, v5.s[0] \n"
5705 "fmla v26.4s, v12.4s, v5.s[1] \n"
5706 "fmla v27.4s, v13.4s, v5.s[1] \n"
5707 "fmla v28.4s, v12.4s, v5.s[2] \n"
5708 "fmla v29.4s, v13.4s, v5.s[2] \n"
5709 "fmla v30.4s, v12.4s, v5.s[3] \n"
5710 "fmla v31.4s, v13.4s, v5.s[3] \n"
5711
5712 "fmla v16.4s, v14.4s, v6.s[0] \n"
5713 "fmla v17.4s, v15.4s, v6.s[0] \n"
5714 "fmla v18.4s, v14.4s, v6.s[1] \n"
5715 "fmla v19.4s, v15.4s, v6.s[1] \n"
5716 "fmla v20.4s, v14.4s, v6.s[2] \n"
5717 "fmla v21.4s, v15.4s, v6.s[2] \n"
5718 "fmla v22.4s, v14.4s, v6.s[3] \n"
5719 "fmla v23.4s, v15.4s, v6.s[3] \n"
5720
5721 "subs w4, w4, #1 \n"
5722
5723 "fmla v24.4s, v14.4s, v7.s[0] \n"
5724 "fmla v25.4s, v15.4s, v7.s[0] \n"
5725 "fmla v26.4s, v14.4s, v7.s[1] \n"
5726 "fmla v27.4s, v15.4s, v7.s[1] \n"
5727 "fmla v28.4s, v14.4s, v7.s[2] \n"
5728 "fmla v29.4s, v15.4s, v7.s[2] \n"
5729 "fmla v30.4s, v14.4s, v7.s[3] \n"
5730 "fmla v31.4s, v15.4s, v7.s[3] \n"
5731
5732 "bne 0b \n"
5733
5734 "1: \n"
5735
5736 // remain loop
5737 "and w4, %w20, #3 \n" // w4 = remain = tiles & 3;
5738 "cmp w4, #0 \n"
5739 "beq 3f \n"
5740
5741 "2: \n"
5742
5743 "prfm pldl1keep, [%8, #256] \n"
5744 "ld1 {v8.4s, v9.4s}, [%8], #32 \n"
5745
5746 "prfm pldl1keep, [%9, #256] \n"
5747 "ld1 {v0.4s, v1.4s}, [%9], #32 \n"
5748
5749 "fmla v16.4s, v8.4s, v0.s[0] \n"
5750 "fmla v17.4s, v9.4s, v0.s[0] \n"
5751 "fmla v18.4s, v8.4s, v0.s[1] \n"
5752 "fmla v19.4s, v9.4s, v0.s[1] \n"
5753 "fmla v20.4s, v8.4s, v0.s[2] \n"
5754 "fmla v21.4s, v9.4s, v0.s[2] \n"
5755 "fmla v22.4s, v8.4s, v0.s[3] \n"
5756 "fmla v23.4s, v9.4s, v0.s[3] \n"
5757
5758 "subs w4, w4, #1 \n"
5759
5760 "fmla v24.4s, v8.4s, v1.s[0] \n"
5761 "fmla v25.4s, v9.4s, v1.s[0] \n"
5762 "fmla v26.4s, v8.4s, v1.s[1] \n"
5763 "fmla v27.4s, v9.4s, v1.s[1] \n"
5764 "fmla v28.4s, v8.4s, v1.s[2] \n"
5765 "fmla v29.4s, v9.4s, v1.s[2] \n"
5766 "fmla v30.4s, v8.4s, v1.s[3] \n"
5767 "fmla v31.4s, v9.4s, v1.s[3] \n"
5768
5769 "bne 2b \n"
5770
5771 "3: \n"
5772
5773 "st1 {v16.4s, v17.4s}, [%0], #32 \n"
5774 "st1 {v18.4s, v19.4s}, [%1], #32 \n"
5775 "st1 {v20.4s, v21.4s}, [%2], #32 \n"
5776 "st1 {v22.4s, v23.4s}, [%3], #32 \n"
5777 "st1 {v24.4s, v25.4s}, [%4], #32 \n"
5778 "st1 {v26.4s, v27.4s}, [%5], #32 \n"
5779 "st1 {v28.4s, v29.4s}, [%6], #32 \n"
5780 "st1 {v30.4s, v31.4s}, [%7], #32 \n"
5781
5782 : "=r"(output0_tm), // %0
5783 "=r"(output1_tm), // %1
5784 "=r"(output2_tm), // %2
5785 "=r"(output3_tm), // %3
5786 "=r"(output4_tm), // %4
5787 "=r"(output5_tm), // %5
5788 "=r"(output6_tm), // %6
5789 "=r"(output7_tm), // %7
5790 "=r"(bb2p0), // %8
5791 "=r"(ktm0) // %9
5792 : "0"(output0_tm),
5793 "1"(output1_tm),
5794 "2"(output2_tm),
5795 "3"(output3_tm),
5796 "4"(output4_tm),
5797 "5"(output5_tm),
5798 "6"(output6_tm),
5799 "7"(output7_tm),
5800 "8"(bb2p0),
5801 "9"(ktm0),
5802 "r"(inch) // %20
5803 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
5804 }
5805 for (; i + 3 < tiles; i += 4)
5806 {
5807 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
5808
5809 const float* ktm0 = kernel_tm0.row(r);
5810
5811 asm volatile(
5812 "eor v16.16b, v16.16b, v16.16b \n"
5813 "eor v17.16b, v17.16b, v17.16b \n"
5814 "eor v18.16b, v18.16b, v18.16b \n"
5815 "eor v19.16b, v19.16b, v19.16b \n"
5816 "eor v20.16b, v20.16b, v20.16b \n"
5817 "eor v21.16b, v21.16b, v21.16b \n"
5818 "eor v22.16b, v22.16b, v22.16b \n"
5819 "eor v23.16b, v23.16b, v23.16b \n"
5820
5821 // inch loop
5822 "lsr w4, %w20, #2 \n" // w4 = nn = inch >> 2
5823 "cmp w4, #0 \n"
5824 "beq 1f \n"
5825
5826 "0: \n"
5827
5828 "prfm pldl1keep, [%8, #512] \n"
5829 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%8], #64 \n"
5830
5831 "prfm pldl1keep, [%9, #512] \n"
5832 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
5833
5834 "fmla v16.4s, v8.4s, v0.s[0] \n"
5835 "fmla v17.4s, v8.4s, v0.s[1] \n"
5836 "fmla v18.4s, v8.4s, v0.s[2] \n"
5837 "fmla v19.4s, v8.4s, v0.s[3] \n"
5838 "fmla v20.4s, v8.4s, v1.s[0] \n"
5839 "fmla v21.4s, v8.4s, v1.s[1] \n"
5840 "fmla v22.4s, v8.4s, v1.s[2] \n"
5841 "fmla v23.4s, v8.4s, v1.s[3] \n"
5842
5843 "prfm pldl1keep, [%9, #512] \n"
5844 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%9], #64 \n"
5845
5846 "fmla v16.4s, v9.4s, v2.s[0] \n"
5847 "fmla v17.4s, v9.4s, v2.s[1] \n"
5848 "fmla v18.4s, v9.4s, v2.s[2] \n"
5849 "fmla v19.4s, v9.4s, v2.s[3] \n"
5850 "fmla v20.4s, v9.4s, v3.s[0] \n"
5851 "fmla v21.4s, v9.4s, v3.s[1] \n"
5852 "fmla v22.4s, v9.4s, v3.s[2] \n"
5853 "fmla v23.4s, v9.4s, v3.s[3] \n"
5854
5855 "fmla v16.4s, v10.4s, v4.s[0] \n"
5856 "fmla v17.4s, v10.4s, v4.s[1] \n"
5857 "fmla v18.4s, v10.4s, v4.s[2] \n"
5858 "fmla v19.4s, v10.4s, v4.s[3] \n"
5859 "fmla v20.4s, v10.4s, v5.s[0] \n"
5860 "fmla v21.4s, v10.4s, v5.s[1] \n"
5861 "fmla v22.4s, v10.4s, v5.s[2] \n"
5862 "fmla v23.4s, v10.4s, v5.s[3] \n"
5863
5864 "subs w4, w4, #1 \n"
5865
5866 "fmla v16.4s, v11.4s, v6.s[0] \n"
5867 "fmla v17.4s, v11.4s, v6.s[1] \n"
5868 "fmla v18.4s, v11.4s, v6.s[2] \n"
5869 "fmla v19.4s, v11.4s, v6.s[3] \n"
5870 "fmla v20.4s, v11.4s, v7.s[0] \n"
5871 "fmla v21.4s, v11.4s, v7.s[1] \n"
5872 "fmla v22.4s, v11.4s, v7.s[2] \n"
5873 "fmla v23.4s, v11.4s, v7.s[3] \n"
5874
5875 "bne 0b \n"
5876
5877 "1: \n"
5878
5879 // remain loop
5880 "and w4, %w20, #3 \n" // w4 = remain = tiles & 3;
5881 "cmp w4, #0 \n"
5882 "beq 3f \n"
5883
5884 "2: \n"
5885
5886 "prfm pldl1keep, [%8, #128] \n"
5887 "ld1 {v8.4s}, [%8], #16 \n"
5888
5889 "prfm pldl1keep, [%9, #256] \n"
5890 "ld1 {v0.4s, v1.4s}, [%9], #32 \n"
5891
5892 "fmla v16.4s, v8.4s, v0.s[0] \n"
5893 "fmla v17.4s, v8.4s, v0.s[1] \n"
5894 "fmla v18.4s, v8.4s, v0.s[2] \n"
5895 "fmla v19.4s, v8.4s, v0.s[3] \n"
5896
5897 "subs w4, w4, #1 \n"
5898
5899 "fmla v20.4s, v8.4s, v1.s[0] \n"
5900 "fmla v21.4s, v8.4s, v1.s[1] \n"
5901 "fmla v22.4s, v8.4s, v1.s[2] \n"
5902 "fmla v23.4s, v8.4s, v1.s[3] \n"
5903
5904 "bne 2b \n"
5905
5906 "3: \n"
5907
5908 "st1 {v16.4s}, [%0], #16 \n"
5909 "st1 {v17.4s}, [%1], #16 \n"
5910 "st1 {v18.4s}, [%2], #16 \n"
5911 "st1 {v19.4s}, [%3], #16 \n"
5912 "st1 {v20.4s}, [%4], #16 \n"
5913 "st1 {v21.4s}, [%5], #16 \n"
5914 "st1 {v22.4s}, [%6], #16 \n"
5915 "st1 {v23.4s}, [%7], #16 \n"
5916
5917 : "=r"(output0_tm), // %0
5918 "=r"(output1_tm), // %1
5919 "=r"(output2_tm), // %2
5920 "=r"(output3_tm), // %3
5921 "=r"(output4_tm), // %4
5922 "=r"(output5_tm), // %5
5923 "=r"(output6_tm), // %6
5924 "=r"(output7_tm), // %7
5925 "=r"(bb2p0), // %8
5926 "=r"(ktm0) // %9
5927 : "0"(output0_tm),
5928 "1"(output1_tm),
5929 "2"(output2_tm),
5930 "3"(output3_tm),
5931 "4"(output4_tm),
5932 "5"(output5_tm),
5933 "6"(output6_tm),
5934 "7"(output7_tm),
5935 "8"(bb2p0),
5936 "9"(ktm0),
5937 "r"(inch) // %20
5938 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
5939 }
5940 for (; i < tiles; i++)
5941 {
5942 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
5943
5944 const float* ktm0 = kernel_tm0.row(r);
5945
5946 float32x4_t _sum0123 = vdupq_n_f32(0.f);
5947 float32x4_t _sum4567 = vdupq_n_f32(0.f);
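                    // single-tile tail: _sum0123 / _sum4567 accumulate the dot
                    // product of this tile's inch values with the packed kernels
                    // for output channels p..p+3 and p+4..p+7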
5948
5949 int q = 0;
5950 for (; q + 3 < inch; q += 4)
5951 {
5952 // asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
5953 float32x4_t _bb2p0 = vld1q_f32(bb2p0);
5954 bb2p0 += 4;
5955
5956 // asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
5957 float32x4_t _ktm0 = vld1q_f32(ktm0 + 0);
5958 float32x4_t _ktm1 = vld1q_f32(ktm0 + 4);
5959 float32x4_t _ktm2 = vld1q_f32(ktm0 + 8);
5960 float32x4_t _ktm3 = vld1q_f32(ktm0 + 12);
5961 ktm0 += 16;
5962
5963 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm0, _bb2p0, 0);
5964 _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm1, _bb2p0, 0);
5965 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm2, _bb2p0, 1);
5966 _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm3, _bb2p0, 1);
5967
5968 // asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
5969 float32x4_t _ktm4 = vld1q_f32(ktm0 + 0);
5970 float32x4_t _ktm5 = vld1q_f32(ktm0 + 4);
5971 float32x4_t _ktm6 = vld1q_f32(ktm0 + 8);
5972 float32x4_t _ktm7 = vld1q_f32(ktm0 + 12);
5973 ktm0 += 16;
5974
5975 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm4, _bb2p0, 2);
5976 _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm5, _bb2p0, 2);
5977 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm6, _bb2p0, 3);
5978 _sum4567 = vmlaq_laneq_f32(_sum4567, _ktm7, _bb2p0, 3);
5979 }
5980
5981 for (; q < inch; q++)
5982 {
5983 float32x4_t _bb2p0 = vld1q_dup_f32(bb2p0);
5984 float32x4_t _ktm0123 = vld1q_f32(ktm0 + 0);
5985 float32x4_t _ktm4567 = vld1q_f32(ktm0 + 4);
5986
5987 _sum0123 = vmlaq_f32(_sum0123, _bb2p0, _ktm0123);
5988 _sum4567 = vmlaq_f32(_sum4567, _bb2p0, _ktm4567);
5989
5990 bb2p0 += 1;
5991 ktm0 += 8;
5992 }
5993
5994 float sum0 = vgetq_lane_f32(_sum0123, 0);
5995 float sum1 = vgetq_lane_f32(_sum0123, 1);
5996 float sum2 = vgetq_lane_f32(_sum0123, 2);
5997 float sum3 = vgetq_lane_f32(_sum0123, 3);
5998 float sum4 = vgetq_lane_f32(_sum4567, 0);
5999 float sum5 = vgetq_lane_f32(_sum4567, 1);
6000 float sum6 = vgetq_lane_f32(_sum4567, 2);
6001 float sum7 = vgetq_lane_f32(_sum4567, 3);
6002
6003 output0_tm[0] = sum0;
6004 output1_tm[0] = sum1;
6005 output2_tm[0] = sum2;
6006 output3_tm[0] = sum3;
6007 output4_tm[0] = sum4;
6008 output5_tm[0] = sum5;
6009 output6_tm[0] = sum6;
6010 output7_tm[0] = sum7;
6011
6012 output0_tm += 1;
6013 output1_tm += 1;
6014 output2_tm += 1;
6015 output3_tm += 1;
6016 output4_tm += 1;
6017 output5_tm += 1;
6018 output6_tm += 1;
6019 output7_tm += 1;
6020 }
6021 }
6022 }
6023 #endif // __aarch64__
6024
6025 nn_outch = (outch - remain_outch_start) >> 2;
6026
6027 #pragma omp parallel for num_threads(opt.num_threads)
6028 for (int pp = 0; pp < nn_outch; pp++)
6029 {
6030 int p = remain_outch_start + pp * 4;
6031
6032 #if __ARM_NEON && __aarch64__
6033 const Mat kernel_tm0 = kernel_tm.channel(p / 8 + (p % 8) / 4);
6034 #else
6035 const Mat kernel_tm0 = kernel_tm.channel(p / 4);
6036 #endif
6037
6038 Mat out0_tm = top_blob_tm.channel(p);
6039 Mat out1_tm = top_blob_tm.channel(p + 1);
6040 Mat out2_tm = top_blob_tm.channel(p + 2);
6041 Mat out3_tm = top_blob_tm.channel(p + 3);
6042
6043 float* output0_tm = out0_tm;
6044 float* output1_tm = out1_tm;
6045 float* output2_tm = out2_tm;
6046 float* output3_tm = out3_tm;
6047
6048 for (int r = 0; r < 64; r++)
6049 {
6050 const Mat bb2 = bottom_blob_tm2.channel(r);
6051
6052 // tile
6053 int i = 0;
6054 for (; i + 7 < tiles; i += 8)
6055 {
6056 const float* bb2p0 = bb2.row(i / 8);
6057
6058 const float* ktm0 = kernel_tm0.row(r);
6059 #if __ARM_NEON
6060 #if __aarch64__
6061 asm volatile(
6062 "eor v8.16b, v8.16b, v8.16b \n"
6063 "eor v9.16b, v9.16b, v9.16b \n"
6064 "eor v10.16b, v10.16b, v10.16b \n"
6065 "eor v11.16b, v11.16b, v11.16b \n"
6066 "eor v12.16b, v12.16b, v12.16b \n"
6067 "eor v13.16b, v13.16b, v13.16b \n"
6068 "eor v14.16b, v14.16b, v14.16b \n"
6069 "eor v15.16b, v15.16b, v15.16b \n"
6070
6071 // inch loop
6072 "lsr w4, %w12, #2 \n" // w4 = nn = inch >> 2
6073 "cmp w4, #0 \n"
6074 "beq 1f \n"
6075
6076 "0: \n"
6077
6078 "prfm pldl1keep, [%4, #512] \n"
6079 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n"
6080
6081 "prfm pldl1keep, [%5, #512] \n"
6082 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
6083
6084 "fmla v8.4s, v4.4s, v0.s[0] \n"
6085 "fmla v9.4s, v5.4s, v0.s[0] \n"
6086 "fmla v10.4s, v4.4s, v0.s[1] \n"
6087 "fmla v11.4s, v5.4s, v0.s[1] \n"
6088 "fmla v12.4s, v4.4s, v0.s[2] \n"
6089 "fmla v13.4s, v5.4s, v0.s[2] \n"
6090 "fmla v14.4s, v4.4s, v0.s[3] \n"
6091 "fmla v15.4s, v5.4s, v0.s[3] \n"
6092
6093 "prfm pldl1keep, [%4, #512] \n"
6094 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
6095
6096 "fmla v8.4s, v6.4s, v1.s[0] \n"
6097 "fmla v9.4s, v7.4s, v1.s[0] \n"
6098 "fmla v10.4s, v6.4s, v1.s[1] \n"
6099 "fmla v11.4s, v7.4s, v1.s[1] \n"
6100 "fmla v12.4s, v6.4s, v1.s[2] \n"
6101 "fmla v13.4s, v7.4s, v1.s[2] \n"
6102 "fmla v14.4s, v6.4s, v1.s[3] \n"
6103 "fmla v15.4s, v7.4s, v1.s[3] \n"
6104
6105 "fmla v8.4s, v16.4s, v2.s[0] \n"
6106 "fmla v9.4s, v17.4s, v2.s[0] \n"
6107 "fmla v10.4s, v16.4s, v2.s[1] \n"
6108 "fmla v11.4s, v17.4s, v2.s[1] \n"
6109 "fmla v12.4s, v16.4s, v2.s[2] \n"
6110 "fmla v13.4s, v17.4s, v2.s[2] \n"
6111 "fmla v14.4s, v16.4s, v2.s[3] \n"
6112 "fmla v15.4s, v17.4s, v2.s[3] \n"
6113
6114 "fmla v8.4s, v18.4s, v3.s[0] \n"
6115 "fmla v9.4s, v19.4s, v3.s[0] \n"
6116 "fmla v10.4s, v18.4s, v3.s[1] \n"
6117 "fmla v11.4s, v19.4s, v3.s[1] \n"
6118 "fmla v12.4s, v18.4s, v3.s[2] \n"
6119 "fmla v13.4s, v19.4s, v3.s[2] \n"
6120 "fmla v14.4s, v18.4s, v3.s[3] \n"
6121 "fmla v15.4s, v19.4s, v3.s[3] \n"
6122
6123 "subs w4, w4, #1 \n"
6124 "bne 0b \n"
6125
6126 "1: \n"
6127
6128 // remain loop
6129 "and w4, %w12, #3 \n" // w4 = remain = tiles & 3;
6130 "cmp w4, #0 \n"
6131 "beq 3f \n"
6132
6133 "2: \n"
6134
6135 "prfm pldl1keep, [%4, #256] \n"
6136 "ld1 {v4.4s, v5.4s}, [%4], #32 \n"
6137
6138 "prfm pldl1keep, [%5, #128] \n"
6139 "ld1 {v0.4s}, [%5], #16 \n"
6140
6141 "fmla v8.4s, v4.4s, v0.s[0] \n"
6142 "fmla v9.4s, v5.4s, v0.s[0] \n"
6143 "fmla v10.4s, v4.4s, v0.s[1] \n"
6144 "fmla v11.4s, v5.4s, v0.s[1] \n"
6145 "fmla v12.4s, v4.4s, v0.s[2] \n"
6146 "fmla v13.4s, v5.4s, v0.s[2] \n"
6147 "fmla v14.4s, v4.4s, v0.s[3] \n"
6148 "fmla v15.4s, v5.4s, v0.s[3] \n"
6149
6150 "subs w4, w4, #1 \n"
6151 "bne 2b \n"
6152
6153 "3: \n"
6154
6155 "st1 {v8.4s, v9.4s}, [%0], #32 \n"
6156 "st1 {v10.4s, v11.4s}, [%1], #32 \n"
6157 "st1 {v12.4s, v13.4s}, [%2], #32 \n"
6158 "st1 {v14.4s, v15.4s}, [%3], #32 \n"
6159
6160 : "=r"(output0_tm), // %0
6161 "=r"(output1_tm), // %1
6162 "=r"(output2_tm), // %2
6163 "=r"(output3_tm), // %3
6164 "=r"(bb2p0), // %4
6165 "=r"(ktm0) // %5
6166 : "0"(output0_tm),
6167 "1"(output1_tm),
6168 "2"(output2_tm),
6169 "3"(output3_tm),
6170 "4"(bb2p0),
6171 "5"(ktm0),
6172 "r"(inch) // %12
6173 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
6174 #else // __aarch64__
6175 asm volatile(
6176 "veor q8, q8, q8 \n"
6177 "veor q9, q9, q9 \n"
6178 "veor q10, q10, q10 \n"
6179 "veor q11, q11, q11 \n"
6180 "veor q12, q12, q12 \n"
6181 "veor q13, q13, q13 \n"
6182 "veor q14, q14, q14 \n"
6183 "veor q15, q15, q15 \n"
6184
6185 // inch loop
6186 "lsr r4, %12, #2 \n" // r4 = nn = inch >> 2
6187 "cmp r4, #0 \n"
6188 "beq 1f \n"
6189
6190 "0: \n"
6191
6192 "pld [%4, #512] \n"
6193 "vldm %4!, {d8-d15} \n"
6194 // "vld1.f32 {d8-d11}, [%4 :128]! \n"
6195 // "vld1.f32 {d12-d15}, [%4 :128]! \n"
6196
6197 "pld [%5, #512] \n"
6198 "vldm %5!, {d0-d7} \n"
6199 // "vld1.f32 {d0-d3}, [%5 :128]! \n"
6200 // "vld1.f32 {d4-d7}, [%5 :128]! \n"
6201
6202 "vmla.f32 q8, q4, d0[0] \n"
6203 "vmla.f32 q9, q5, d0[0] \n"
6204 "vmla.f32 q10, q4, d0[1] \n"
6205 "vmla.f32 q11, q5, d0[1] \n"
6206 "vmla.f32 q12, q4, d1[0] \n"
6207 "vmla.f32 q13, q5, d1[0] \n"
6208 "vmla.f32 q14, q4, d1[1] \n"
6209 "vmla.f32 q15, q5, d1[1] \n"
6210
6211 "vmla.f32 q8, q6, d2[0] \n"
6212 "vmla.f32 q9, q7, d2[0] \n"
6213 "vmla.f32 q10, q6, d2[1] \n"
6214 "vmla.f32 q11, q7, d2[1] \n"
6215 "vmla.f32 q12, q6, d3[0] \n"
6216 "vmla.f32 q13, q7, d3[0] \n"
6217 "vmla.f32 q14, q6, d3[1] \n"
6218 "vmla.f32 q15, q7, d3[1] \n"
6219
6220 "pld [%4, #512] \n"
6221 "vldm %4!, {d8-d15} \n"
6222 // "vld1.f32 {d8-d11}, [%4 :128]! \n"
6223 // "vld1.f32 {d12-d15}, [%4 :128]! \n"
6224
6225 "vmla.f32 q8, q4, d4[0] \n"
6226 "vmla.f32 q9, q5, d4[0] \n"
6227 "vmla.f32 q10, q4, d4[1] \n"
6228 "vmla.f32 q11, q5, d4[1] \n"
6229 "vmla.f32 q12, q4, d5[0] \n"
6230 "vmla.f32 q13, q5, d5[0] \n"
6231 "vmla.f32 q14, q4, d5[1] \n"
6232 "vmla.f32 q15, q5, d5[1] \n"
6233
6234 "subs r4, r4, #1 \n"
6235
6236 "vmla.f32 q8, q6, d6[0] \n"
6237 "vmla.f32 q9, q7, d6[0] \n"
6238 "vmla.f32 q10, q6, d6[1] \n"
6239 "vmla.f32 q11, q7, d6[1] \n"
6240 "vmla.f32 q12, q6, d7[0] \n"
6241 "vmla.f32 q13, q7, d7[0] \n"
6242 "vmla.f32 q14, q6, d7[1] \n"
6243 "vmla.f32 q15, q7, d7[1] \n"
6244
6245 "bne 0b \n"
6246
6247 "1: \n"
6248
6249 // remain loop
6250 "and r4, %12, #3 \n" // r4 = remain = tiles & 3;
6251 "cmp r4, #0 \n"
6252 "beq 3f \n"
6253
6254 "2: \n"
6255
6256 "pld [%4, #256] \n"
6257 "vld1.f32 {d8-d11}, [%4 :128]! \n"
6258
6259 "pld [%5, #128] \n"
6260 "vld1.f32 {d0-d1}, [%5 :128]! \n"
6261
6262 "vmla.f32 q8, q4, d0[0] \n"
6263 "vmla.f32 q9, q5, d0[0] \n"
6264 "vmla.f32 q10, q4, d0[1] \n"
6265 "vmla.f32 q11, q5, d0[1] \n"
6266
6267 "subs r4, r4, #1 \n"
6268
6269 "vmla.f32 q12, q4, d1[0] \n"
6270 "vmla.f32 q13, q5, d1[0] \n"
6271 "vmla.f32 q14, q4, d1[1] \n"
6272 "vmla.f32 q15, q5, d1[1] \n"
6273
6274 "bne 2b \n"
6275
6276 "3: \n"
6277
6278 "vst1.f32 {d16-d19}, [%0]! \n"
6279 "vst1.f32 {d20-d23}, [%1]! \n"
6280 "vst1.f32 {d24-d27}, [%2]! \n"
6281 "vst1.f32 {d28-d31}, [%3]! \n"
6282
6283 : "=r"(output0_tm), // %0
6284 "=r"(output1_tm), // %1
6285 "=r"(output2_tm), // %2
6286 "=r"(output3_tm), // %3
6287 "=r"(bb2p0), // %4
6288 "=r"(ktm0) // %5
6289 : "0"(output0_tm),
6290 "1"(output1_tm),
6291 "2"(output2_tm),
6292 "3"(output3_tm),
6293 "4"(bb2p0),
6294 "5"(ktm0),
6295 "r"(inch) // %12
6296 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
6297 #endif // __aarch64__
6298 #else
6299 float sum0_0 = 0.f;
6300 float sum0_1 = 0.f;
6301 float sum0_2 = 0.f;
6302 float sum0_3 = 0.f;
6303 float sum0_4 = 0.f;
6304 float sum0_5 = 0.f;
6305 float sum0_6 = 0.f;
6306 float sum0_7 = 0.f;
6307
6308 float sum1_0 = 0.f;
6309 float sum1_1 = 0.f;
6310 float sum1_2 = 0.f;
6311 float sum1_3 = 0.f;
6312 float sum1_4 = 0.f;
6313 float sum1_5 = 0.f;
6314 float sum1_6 = 0.f;
6315 float sum1_7 = 0.f;
6316
6317 float sum2_0 = 0.f;
6318 float sum2_1 = 0.f;
6319 float sum2_2 = 0.f;
6320 float sum2_3 = 0.f;
6321 float sum2_4 = 0.f;
6322 float sum2_5 = 0.f;
6323 float sum2_6 = 0.f;
6324 float sum2_7 = 0.f;
6325
6326 float sum3_0 = 0.f;
6327 float sum3_1 = 0.f;
6328 float sum3_2 = 0.f;
6329 float sum3_3 = 0.f;
6330 float sum3_4 = 0.f;
6331 float sum3_5 = 0.f;
6332 float sum3_6 = 0.f;
6333 float sum3_7 = 0.f;
6334
6335 for (int q = 0; q < inch; q++)
6336 {
6337 sum0_0 += bb2p0[0] * ktm0[0];
6338 sum0_1 += bb2p0[1] * ktm0[0];
6339 sum0_2 += bb2p0[2] * ktm0[0];
6340 sum0_3 += bb2p0[3] * ktm0[0];
6341 sum0_4 += bb2p0[4] * ktm0[0];
6342 sum0_5 += bb2p0[5] * ktm0[0];
6343 sum0_6 += bb2p0[6] * ktm0[0];
6344 sum0_7 += bb2p0[7] * ktm0[0];
6345
6346 sum1_0 += bb2p0[0] * ktm0[1];
6347 sum1_1 += bb2p0[1] * ktm0[1];
6348 sum1_2 += bb2p0[2] * ktm0[1];
6349 sum1_3 += bb2p0[3] * ktm0[1];
6350 sum1_4 += bb2p0[4] * ktm0[1];
6351 sum1_5 += bb2p0[5] * ktm0[1];
6352 sum1_6 += bb2p0[6] * ktm0[1];
6353 sum1_7 += bb2p0[7] * ktm0[1];
6354
6355 sum2_0 += bb2p0[0] * ktm0[2];
6356 sum2_1 += bb2p0[1] * ktm0[2];
6357 sum2_2 += bb2p0[2] * ktm0[2];
6358 sum2_3 += bb2p0[3] * ktm0[2];
6359 sum2_4 += bb2p0[4] * ktm0[2];
6360 sum2_5 += bb2p0[5] * ktm0[2];
6361 sum2_6 += bb2p0[6] * ktm0[2];
6362 sum2_7 += bb2p0[7] * ktm0[2];
6363
6364 sum3_0 += bb2p0[0] * ktm0[3];
6365 sum3_1 += bb2p0[1] * ktm0[3];
6366 sum3_2 += bb2p0[2] * ktm0[3];
6367 sum3_3 += bb2p0[3] * ktm0[3];
6368 sum3_4 += bb2p0[4] * ktm0[3];
6369 sum3_5 += bb2p0[5] * ktm0[3];
6370 sum3_6 += bb2p0[6] * ktm0[3];
6371 sum3_7 += bb2p0[7] * ktm0[3];
6372
6373 bb2p0 += 8;
6374 ktm0 += 4;
6375 }
6376
6377 output0_tm[0] = sum0_0;
6378 output0_tm[1] = sum0_1;
6379 output0_tm[2] = sum0_2;
6380 output0_tm[3] = sum0_3;
6381 output0_tm[4] = sum0_4;
6382 output0_tm[5] = sum0_5;
6383 output0_tm[6] = sum0_6;
6384 output0_tm[7] = sum0_7;
6385
6386 output1_tm[0] = sum1_0;
6387 output1_tm[1] = sum1_1;
6388 output1_tm[2] = sum1_2;
6389 output1_tm[3] = sum1_3;
6390 output1_tm[4] = sum1_4;
6391 output1_tm[5] = sum1_5;
6392 output1_tm[6] = sum1_6;
6393 output1_tm[7] = sum1_7;
6394
6395 output2_tm[0] = sum2_0;
6396 output2_tm[1] = sum2_1;
6397 output2_tm[2] = sum2_2;
6398 output2_tm[3] = sum2_3;
6399 output2_tm[4] = sum2_4;
6400 output2_tm[5] = sum2_5;
6401 output2_tm[6] = sum2_6;
6402 output2_tm[7] = sum2_7;
6403
6404 output3_tm[0] = sum3_0;
6405 output3_tm[1] = sum3_1;
6406 output3_tm[2] = sum3_2;
6407 output3_tm[3] = sum3_3;
6408 output3_tm[4] = sum3_4;
6409 output3_tm[5] = sum3_5;
6410 output3_tm[6] = sum3_6;
6411 output3_tm[7] = sum3_7;
6412
6413 output0_tm += 8;
6414 output1_tm += 8;
6415 output2_tm += 8;
6416 output3_tm += 8;
6417 #endif // __ARM_NEON
6418 }
6419 for (; i + 3 < tiles; i += 4)
6420 {
6421 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
6422
6423 const float* ktm0 = kernel_tm0.row(r);
6424 #if __ARM_NEON
6425 #if __aarch64__
6426 asm volatile(
6427 "eor v8.16b, v8.16b, v8.16b \n"
6428 "eor v9.16b, v9.16b, v9.16b \n"
6429 "eor v10.16b, v10.16b, v10.16b \n"
6430 "eor v11.16b, v11.16b, v11.16b \n"
6431
6432 // inch loop
6433 "lsr w4, %w12, #2 \n" // w4 = nn = inch >> 2
6434 "cmp w4, #0 \n"
6435 "beq 1f \n"
6436
6437 "0: \n"
6438
6439 "prfm pldl1keep, [%4, #512] \n"
6440 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n"
6441
6442 "prfm pldl1keep, [%5, #512] \n"
6443 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
6444
6445 "fmla v8.4s, v4.4s, v0.s[0] \n"
6446 "fmla v9.4s, v4.4s, v0.s[1] \n"
6447 "fmla v10.4s, v4.4s, v0.s[2] \n"
6448 "fmla v11.4s, v4.4s, v0.s[3] \n"
6449
6450 "fmla v8.4s, v5.4s, v1.s[0] \n"
6451 "fmla v9.4s, v5.4s, v1.s[1] \n"
6452 "fmla v10.4s, v5.4s, v1.s[2] \n"
6453 "fmla v11.4s, v5.4s, v1.s[3] \n"
6454
6455 "fmla v8.4s, v6.4s, v2.s[0] \n"
6456 "fmla v9.4s, v6.4s, v2.s[1] \n"
6457 "fmla v10.4s, v6.4s, v2.s[2] \n"
6458 "fmla v11.4s, v6.4s, v2.s[3] \n"
6459
6460 "fmla v8.4s, v7.4s, v3.s[0] \n"
6461 "fmla v9.4s, v7.4s, v3.s[1] \n"
6462 "fmla v10.4s, v7.4s, v3.s[2] \n"
6463 "fmla v11.4s, v7.4s, v3.s[3] \n"
6464
6465 "subs w4, w4, #1 \n"
6466 "bne 0b \n"
6467
6468 "1: \n"
6469
6470 // remain loop
6471 "and w4, %w12, #3 \n" // w4 = remain = tiles & 3;
6472 "cmp w4, #0 \n"
6473 "beq 3f \n"
6474
6475 "2: \n"
6476
6477 "prfm pldl1keep, [%4, #128] \n"
6478 "ld1 {v4.4s}, [%4], #16 \n"
6479
6480 "prfm pldl1keep, [%5, #128] \n"
6481 "ld1 {v0.4s}, [%5], #16 \n"
6482
6483 "fmla v8.4s, v4.4s, v0.s[0] \n"
6484 "fmla v9.4s, v4.4s, v0.s[1] \n"
6485 "fmla v10.4s, v4.4s, v0.s[2] \n"
6486 "fmla v11.4s, v4.4s, v0.s[3] \n"
6487
6488 "subs w4, w4, #1 \n"
6489 "bne 2b \n"
6490
6491 "3: \n"
6492
6493 "st1 {v8.4s}, [%0], #16 \n"
6494 "st1 {v9.4s}, [%1], #16 \n"
6495 "st1 {v10.4s}, [%2], #16 \n"
6496 "st1 {v11.4s}, [%3], #16 \n"
6497
6498 : "=r"(output0_tm), // %0
6499 "=r"(output1_tm), // %1
6500 "=r"(output2_tm), // %2
6501 "=r"(output3_tm), // %3
6502 "=r"(bb2p0), // %4
6503 "=r"(ktm0) // %5
6504 : "0"(output0_tm),
6505 "1"(output1_tm),
6506 "2"(output2_tm),
6507 "3"(output3_tm),
6508 "4"(bb2p0),
6509 "5"(ktm0),
6510 "r"(inch) // %12
6511 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
6512 #else // __aarch64__
6513 asm volatile(
6514 "veor q8, q8, q8 \n"
6515 "veor q9, q9, q9 \n"
6516 "veor q10, q10, q10 \n"
6517 "veor q11, q11, q11 \n"
6518
6519 // inch loop
6520 "lsr r4, %12, #2 \n" // r4 = nn = inch >> 2
6521 "cmp r4, #0 \n"
6522 "beq 1f \n"
6523
6524 "0: \n"
6525
6526 "pld [%4, #512] \n"
6527 "vldm %4!, {d8-d15} \n"
6528 // "vld1.f32 {d8-d11}, [%4 :128]! \n"
6529 // "vld1.f32 {d12-d15}, [%4 :128]! \n"
6530
6531 "pld [%5, #512] \n"
6532 "vldm %5!, {d0-d7} \n"
6533 // "vld1.f32 {d0-d3}, [%5 :128]! \n"
6534 // "vld1.f32 {d4-d7}, [%5 :128]! \n"
6535
6536 "vmla.f32 q8, q4, d0[0] \n"
6537 "vmla.f32 q9, q4, d0[1] \n"
6538 "vmla.f32 q10, q4, d1[0] \n"
6539 "vmla.f32 q11, q4, d1[1] \n"
6540
6541 "vmla.f32 q8, q5, d2[0] \n"
6542 "vmla.f32 q9, q5, d2[1] \n"
6543 "vmla.f32 q10, q5, d3[0] \n"
6544 "vmla.f32 q11, q5, d3[1] \n"
6545
6546 "subs r4, r4, #1 \n"
6547
6548 "vmla.f32 q8, q6, d4[0] \n"
6549 "vmla.f32 q9, q6, d4[1] \n"
6550 "vmla.f32 q10, q6, d5[0] \n"
6551 "vmla.f32 q11, q6, d5[1] \n"
6552
6553 "vmla.f32 q8, q7, d6[0] \n"
6554 "vmla.f32 q9, q7, d6[1] \n"
6555 "vmla.f32 q10, q7, d7[0] \n"
6556 "vmla.f32 q11, q7, d7[1] \n"
6557
6558 "bne 0b \n"
6559
6560 "1: \n"
6561
6562 // remain loop
6563 "and r4, %12, #3 \n" // r4 = remain = tiles & 3;
6564 "cmp r4, #0 \n"
6565 "beq 3f \n"
6566
6567 "2: \n"
6568
6569 "pld [%4, #128] \n"
6570 "vld1.f32 {d8-d9}, [%4 :128]! \n"
6571
6572 "pld [%5, #128] \n"
6573 "vld1.f32 {d0-d1}, [%5 :128]! \n"
6574
6575 "subs r4, r4, #1 \n"
6576
6577 "vmla.f32 q8, q4, d0[0] \n"
6578 "vmla.f32 q9, q4, d0[1] \n"
6579 "vmla.f32 q10, q4, d1[0] \n"
6580 "vmla.f32 q11, q4, d1[1] \n"
6581
6582 "bne 2b \n"
6583
6584 "3: \n"
6585
6586 "vst1.f32 {d16-d17}, [%0]! \n"
6587 "vst1.f32 {d18-d19}, [%1]! \n"
6588 "vst1.f32 {d20-d21}, [%2]! \n"
6589 "vst1.f32 {d22-d23}, [%3]! \n"
6590
6591 : "=r"(output0_tm), // %0
6592 "=r"(output1_tm), // %1
6593 "=r"(output2_tm), // %2
6594 "=r"(output3_tm), // %3
6595 "=r"(bb2p0), // %4
6596 "=r"(ktm0) // %5
6597 : "0"(output0_tm),
6598 "1"(output1_tm),
6599 "2"(output2_tm),
6600 "3"(output3_tm),
6601 "4"(bb2p0),
6602 "5"(ktm0),
6603 "r"(inch) // %12
6604 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
6605 #endif // __aarch64__
6606 #else
6607 float sum0_0 = 0.f;
6608 float sum0_1 = 0.f;
6609 float sum0_2 = 0.f;
6610 float sum0_3 = 0.f;
6611
6612 float sum1_0 = 0.f;
6613 float sum1_1 = 0.f;
6614 float sum1_2 = 0.f;
6615 float sum1_3 = 0.f;
6616
6617 float sum2_0 = 0.f;
6618 float sum2_1 = 0.f;
6619 float sum2_2 = 0.f;
6620 float sum2_3 = 0.f;
6621
6622 float sum3_0 = 0.f;
6623 float sum3_1 = 0.f;
6624 float sum3_2 = 0.f;
6625 float sum3_3 = 0.f;
6626
6627 for (int q = 0; q < inch; q++)
6628 {
6629 sum0_0 += bb2p0[0] * ktm0[0];
6630 sum0_1 += bb2p0[1] * ktm0[0];
6631 sum0_2 += bb2p0[2] * ktm0[0];
6632 sum0_3 += bb2p0[3] * ktm0[0];
6633
6634 sum1_0 += bb2p0[0] * ktm0[1];
6635 sum1_1 += bb2p0[1] * ktm0[1];
6636 sum1_2 += bb2p0[2] * ktm0[1];
6637 sum1_3 += bb2p0[3] * ktm0[1];
6638
6639 sum2_0 += bb2p0[0] * ktm0[2];
6640 sum2_1 += bb2p0[1] * ktm0[2];
6641 sum2_2 += bb2p0[2] * ktm0[2];
6642 sum2_3 += bb2p0[3] * ktm0[2];
6643
6644 sum3_0 += bb2p0[0] * ktm0[3];
6645 sum3_1 += bb2p0[1] * ktm0[3];
6646 sum3_2 += bb2p0[2] * ktm0[3];
6647 sum3_3 += bb2p0[3] * ktm0[3];
6648
6649 bb2p0 += 4;
6650 ktm0 += 4;
6651 }
6652
6653 output0_tm[0] = sum0_0;
6654 output0_tm[1] = sum0_1;
6655 output0_tm[2] = sum0_2;
6656 output0_tm[3] = sum0_3;
6657
6658 output1_tm[0] = sum1_0;
6659 output1_tm[1] = sum1_1;
6660 output1_tm[2] = sum1_2;
6661 output1_tm[3] = sum1_3;
6662
6663 output2_tm[0] = sum2_0;
6664 output2_tm[1] = sum2_1;
6665 output2_tm[2] = sum2_2;
6666 output2_tm[3] = sum2_3;
6667
6668 output3_tm[0] = sum3_0;
6669 output3_tm[1] = sum3_1;
6670 output3_tm[2] = sum3_2;
6671 output3_tm[3] = sum3_3;
6672
6673 output0_tm += 4;
6674 output1_tm += 4;
6675 output2_tm += 4;
6676 output3_tm += 4;
6677 #endif // __ARM_NEON
6678 }
6679 for (; i < tiles; i++)
6680 {
6681 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
6682
6683 const float* ktm0 = kernel_tm0.row(r);
6684
6685 #if __ARM_NEON
6686 float32x4_t _sum0123 = vdupq_n_f32(0.f);
6687
6688 int q = 0;
6689 for (; q + 3 < inch; q += 4)
6690 {
6691 // asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
6692 float32x4_t _bb2p0 = vld1q_f32(bb2p0);
6693 bb2p0 += 4;
6694
6695 // asm volatile("prfm pldl1keep, [%0, #512] \n" : :"r"(ktm0) :);
6696 float32x4_t _ktm0 = vld1q_f32(ktm0 + 0);
6697 float32x4_t _ktm1 = vld1q_f32(ktm0 + 4);
6698 float32x4_t _ktm2 = vld1q_f32(ktm0 + 8);
6699 float32x4_t _ktm3 = vld1q_f32(ktm0 + 12);
6700 ktm0 += 16;
6701
6702 #if __aarch64__
6703 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm0, _bb2p0, 0);
6704 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm1, _bb2p0, 1);
6705 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm2, _bb2p0, 2);
6706 _sum0123 = vmlaq_laneq_f32(_sum0123, _ktm3, _bb2p0, 3);
6707 #else
6708 _sum0123 = vmlaq_lane_f32(_sum0123, _ktm0, vget_low_f32(_bb2p0), 0);
6709 _sum0123 = vmlaq_lane_f32(_sum0123, _ktm1, vget_low_f32(_bb2p0), 1);
6710 _sum0123 = vmlaq_lane_f32(_sum0123, _ktm2, vget_high_f32(_bb2p0), 0);
6711 _sum0123 = vmlaq_lane_f32(_sum0123, _ktm3, vget_high_f32(_bb2p0), 1);
6712 #endif // __aarch64__
6713 }
6714
6715 for (; q < inch; q++)
6716 {
6717 float32x4_t _bb2p0 = vld1q_dup_f32(bb2p0);
6718 float32x4_t _ktm0 = vld1q_f32(ktm0);
6719
6720 _sum0123 = vmlaq_f32(_sum0123, _bb2p0, _ktm0);
6721
6722 bb2p0 += 1;
6723 ktm0 += 4;
6724 }
6725
6726 float sum0 = vgetq_lane_f32(_sum0123, 0);
6727 float sum1 = vgetq_lane_f32(_sum0123, 1);
6728 float sum2 = vgetq_lane_f32(_sum0123, 2);
6729 float sum3 = vgetq_lane_f32(_sum0123, 3);
6730 #else
6731 float sum0 = 0.f;
6732 float sum1 = 0.f;
6733 float sum2 = 0.f;
6734 float sum3 = 0.f;
6735
6736 for (int q = 0; q < inch; q++)
6737 {
6738 sum0 += bb2p0[0] * ktm0[0];
6739 sum1 += bb2p0[0] * ktm0[1];
6740 sum2 += bb2p0[0] * ktm0[2];
6741 sum3 += bb2p0[0] * ktm0[3];
6742
6743 bb2p0 += 1;
6744 ktm0 += 4;
6745 }
6746 #endif // __ARM_NEON
6747
6748 output0_tm[0] = sum0;
6749 output1_tm[0] = sum1;
6750 output2_tm[0] = sum2;
6751 output3_tm[0] = sum3;
6752
6753 output0_tm += 1;
6754 output1_tm += 1;
6755 output2_tm += 1;
6756 output3_tm += 1;
6757 }
6758 }
6759 }
6760
6761 remain_outch_start += nn_outch << 2;
6762
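// leftover output channels are computed one at a time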
6763 #pragma omp parallel for num_threads(opt.num_threads)
6764 for (int p = remain_outch_start; p < outch; p++)
6765 {
6766 #if __ARM_NEON && __aarch64__
6767 const Mat kernel_tm0 = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4);
6768 #else
6769 const Mat kernel_tm0 = kernel_tm.channel(p / 4 + p % 4);
6770 #endif
6771
6772 Mat out0_tm = top_blob_tm.channel(p);
6773
6774 float* output0_tm = out0_tm;
6775
6776 for (int r = 0; r < 64; r++)
6777 {
6778 const Mat bb2 = bottom_blob_tm2.channel(r);
6779
6780 // tile
6781 int i = 0;
6782 for (; i + 7 < tiles; i += 8)
6783 {
6784 const float* bb2p0 = bb2.row(i / 8);
6785
6786 const float* ktm0 = kernel_tm0.row(r);
6787 #if __ARM_NEON
6788 #if __aarch64__
6789 asm volatile(
6790 "eor v8.16b, v8.16b, v8.16b \n"
6791 "eor v9.16b, v9.16b, v9.16b \n"
6792
6793 // inch loop
6794 "lsr w4, %w6, #2 \n" // w4 = nn = inch >> 2
6795 "cmp w4, #0 \n"
6796 "beq 1f \n"
6797
6798 "0: \n"
6799
6800 "prfm pldl1keep, [%1, #512] \n"
6801 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
6802
6803 "prfm pldl1keep, [%2, #128] \n"
6804 "ld1 {v0.4s}, [%2], #16 \n"
6805
6806 "fmla v8.4s, v4.4s, v0.s[0] \n"
6807 "fmla v9.4s, v5.4s, v0.s[0] \n"
6808 "fmla v8.4s, v6.4s, v0.s[1] \n"
6809 "fmla v9.4s, v7.4s, v0.s[1] \n"
6810
6811 "prfm pldl1keep, [%1, #512] \n"
6812 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
6813
6814 "fmla v8.4s, v12.4s, v0.s[2] \n"
6815 "fmla v9.4s, v13.4s, v0.s[2] \n"
6816 "fmla v8.4s, v14.4s, v0.s[3] \n"
6817 "fmla v9.4s, v15.4s, v0.s[3] \n"
6818
6819 "subs w4, w4, #1 \n"
6820 "bne 0b \n"
6821
6822 "1: \n"
6823
6824 // remain loop
6825 "and w4, %w6, #3 \n" // w4 = remain = tiles & 3;
6826 "cmp w4, #0 \n"
6827 "beq 3f \n"
6828
6829 "2: \n"
6830
6831 "prfm pldl1keep, [%1, #256] \n"
6832 "ld1 {v4.4s, v5.4s}, [%1], #32 \n"
6833
6834 "prfm pldl1keep, [%2, #32] \n"
6835 "ld1r {v0.4s}, [%2], #4 \n"
6836
6837 "fmla v8.4s, v4.4s, v0.4s \n"
6838 "fmla v9.4s, v5.4s, v0.4s \n"
6839
6840 "subs w4, w4, #1 \n"
6841 "bne 2b \n"
6842
6843 "3: \n"
6844
6845 "st1 {v8.4s, v9.4s}, [%0], #32 \n"
6846
6847 : "=r"(output0_tm), // %0
6848 "=r"(bb2p0), // %1
6849 "=r"(ktm0) // %2
6850 : "0"(output0_tm),
6851 "1"(bb2p0),
6852 "2"(ktm0),
6853 "r"(inch) // %6
6854 : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v12", "v13", "v14", "v15");
6855 #else // __aarch64__
6856 asm volatile(
6857 "veor q8, q8, q8 \n"
6858 "veor q9, q9, q9 \n"
6859
6860 // inch loop
6861 "lsr r4, %6, #2 \n" // r4 = nn = inch >> 2
6862 "cmp r4, #0 \n"
6863 "beq 1f \n"
6864
6865 "0: \n"
6866
6867 "pld [%1, #512] \n"
6868 "vldm %1!, {d8-d15} \n"
6869 // "vld1.f32 {d8-d11}, [%1 :128]! \n"
6870 // "vld1.f32 {d12-d15}, [%1 :128]! \n"
6871
6872 "pld [%2, #128] \n"
6873 "vld1.f32 {d0-d1}, [%2 :128]! \n"
6874
6875 "vmla.f32 q8, q4, d0[0] \n"
6876 "vmla.f32 q9, q5, d0[0] \n"
6877 "vmla.f32 q8, q6, d0[1] \n"
6878 "vmla.f32 q9, q7, d0[1] \n"
6879
6880 "pld [%1, #512] \n"
6881 "vldm %1!, {d24-d31} \n"
6882 // "vld1.f32 {d24-d27}, [%1 :128]! \n"
6883 // "vld1.f32 {d28-d31}, [%1 :128]! \n"
6884
6885 "subs r4, r4, #1 \n"
6886
6887 "vmla.f32 q8, q12, d1[0] \n"
6888 "vmla.f32 q9, q13, d1[0] \n"
6889 "vmla.f32 q8, q14, d1[1] \n"
6890 "vmla.f32 q9, q15, d1[1] \n"
6891
6892 "bne 0b \n"
6893
6894 "1: \n"
6895
6896 // remain loop
6897 "and r4, %6, #3 \n" // r4 = remain = tiles & 3;
6898 "cmp r4, #0 \n"
6899 "beq 3f \n"
6900
6901 "2: \n"
6902
6903 "pld [%1, #256] \n"
6904 "vld1.f32 {d8-d11}, [%1 :128]! \n"
6905
6906 "pld [%2, #32] \n"
6907 "vld1.f32 {d0[],d1[]}, [%2]! \n"
6908
6909 "subs r4, r4, #1 \n"
6910
6911 "vmla.f32 q8, q4, q0 \n"
6912 "vmla.f32 q9, q5, q0 \n"
6913
6914 "bne 2b \n"
6915
6916 "3: \n"
6917
6918 "vst1.f32 {d16-d19}, [%0]! \n"
6919
6920 : "=r"(output0_tm), // %0
6921 "=r"(bb2p0), // %1
6922 "=r"(ktm0) // %2
6923 : "0"(output0_tm),
6924 "1"(bb2p0),
6925 "2"(ktm0),
6926 "r"(inch) // %6
6927 : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q12", "q13", "q14", "q15");
6928 #endif // __aarch64__
6929 #else
6930 float sum0 = 0.f;
6931 float sum1 = 0.f;
6932 float sum2 = 0.f;
6933 float sum3 = 0.f;
6934 float sum4 = 0.f;
6935 float sum5 = 0.f;
6936 float sum6 = 0.f;
6937 float sum7 = 0.f;
6938
6939 for (int q = 0; q < inch; q++)
6940 {
6941 sum0 += bb2p0[0] * ktm0[0];
6942 sum1 += bb2p0[1] * ktm0[0];
6943 sum2 += bb2p0[2] * ktm0[0];
6944 sum3 += bb2p0[3] * ktm0[0];
6945 sum4 += bb2p0[4] * ktm0[0];
6946 sum5 += bb2p0[5] * ktm0[0];
6947 sum6 += bb2p0[6] * ktm0[0];
6948 sum7 += bb2p0[7] * ktm0[0];
6949
6950 bb2p0 += 8;
6951 ktm0 += 1;
6952 }
6953
6954 output0_tm[0] = sum0;
6955 output0_tm[1] = sum1;
6956 output0_tm[2] = sum2;
6957 output0_tm[3] = sum3;
6958 output0_tm[4] = sum4;
6959 output0_tm[5] = sum5;
6960 output0_tm[6] = sum6;
6961 output0_tm[7] = sum7;
6962
6963 output0_tm += 8;
6964 #endif // __ARM_NEON
6965 }
6966 for (; i + 3 < tiles; i += 4)
6967 {
6968 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4);
6969
6970 const float* ktm0 = kernel_tm0.row(r);
6971 #if __ARM_NEON
6972 #if __aarch64__
6973 asm volatile(
6974 "eor v8.16b, v8.16b, v8.16b \n"
6975
6976 // inch loop
6977 "lsr w4, %w6, #2 \n" // w4 = nn = inch >> 2
6978 "cmp w4, #0 \n"
6979 "beq 1f \n"
6980
6981 "0: \n"
6982
6983 "prfm pldl1keep, [%4, #512] \n"
6984 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n"
6985
6986 "prfm pldl1keep, [%5, #128] \n"
6987 "ld1 {v0.4s}, [%5], #16 \n"
6988
6989 "fmla v8.4s, v4.4s, v0.s[0] \n"
6990 "fmla v8.4s, v5.4s, v0.s[1] \n"
6991 "fmla v8.4s, v6.4s, v0.s[2] \n"
6992 "fmla v8.4s, v7.4s, v0.s[3] \n"
6993
6994 "subs w4, w4, #1 \n"
6995 "bne 0b \n"
6996
6997 "1: \n"
6998
6999 // remain loop
7000 "and w4, %w6, #3 \n" // w4 = remain = tiles & 3;
7001 "cmp w4, #0 \n"
7002 "beq 3f \n"
7003
7004 "2: \n"
7005
7006 "prfm pldl1keep, [%4, #128] \n"
7007 "ld1 {v4.4s}, [%4], #16 \n"
7008
7009 "prfm pldl1keep, [%5, #32] \n"
7010 "ld1r {v0.4s}, [%5], #4 \n"
7011
7012 "fmla v8.4s, v4.4s, v0.4s \n"
7013
7014 "subs w4, w4, #1 \n"
7015 "bne 2b \n"
7016
7017 "3: \n"
7018
7019 "st1 {v8.4s}, [%0], #16 \n"
7020
7021 : "=r"(output0_tm), // %0
7022 "=r"(bb2p0), // %1
7023 "=r"(ktm0) // %2
7024 : "0"(output0_tm),
7025 "1"(bb2p0),
7026 "2"(ktm0),
7027 "r"(inch) // %6
7028 : "cc", "memory", "x4", "v0", "v4", "v5", "v6", "v7", "v8");
7029 #else // __aarch64__
7030 asm volatile(
7031 "veor q8, q8, q8 \n"
7032
7033 // inch loop
7034 "lsr r4, %6, #2 \n" // r4 = nn = inch >> 2
7035 "cmp r4, #0 \n"
7036 "beq 1f \n"
7037
7038 "0: \n"
7039
7040 "pld [%4, #512] \n"
7041 "vldm %4!, {d8-d15} \n"
7042 // "vld1.f32 {d8-d11}, [%4 :128]! \n"
7043 // "vld1.f32 {d12-d15}, [%4 :128]! \n"
7044
7045 "pld [%5, #128] \n"
7046 "vld1.f32 {d0-d1}, [%5 :128]! \n"
7047
7048 "subs r4, r4, #1 \n"
7049
7050 "vmla.f32 q8, q4, d0[0] \n"
7051 "vmla.f32 q8, q5, d0[1] \n"
7052 "vmla.f32 q8, q6, d1[0] \n"
7053 "vmla.f32 q8, q7, d1[1] \n"
7054
7055 "bne 0b \n"
7056
7057 "1: \n"
7058
7059 // remain loop
7060 "and r4, %6, #3 \n" // r4 = remain = tiles & 3;
7061 "cmp r4, #0 \n"
7062 "beq 3f \n"
7063
7064 "2: \n"
7065
7066 "pld [%4, #128] \n"
7067 "vld1.f32 {d8-d9}, [%4]! \n"
7068
7069 "pld [%5, #32] \n"
7070 "vld1.f32 {d0[],d1[]}, [%5]! \n"
7071
7072 "subs r4, r4, #1 \n"
7073
7074 "vmla.f32 q8, q4, q0 \n"
7075
7076 "bne 2b \n"
7077
7078 "3: \n"
7079
7080 "vst1.f32 {d16-d17}, [%0]! \n"
7081
7082 : "=r"(output0_tm), // %0
7083 "=r"(bb2p0), // %1
7084 "=r"(ktm0) // %2
7085 : "0"(output0_tm),
7086 "1"(bb2p0),
7087 "2"(ktm0),
7088 "r"(inch) // %6
7089 : "cc", "memory", "r4", "q0", "q4", "q5", "q6", "q7", "q8");
7090 #endif // __aarch64__
7091 #else
7092 float sum0 = 0.f;
7093 float sum1 = 0.f;
7094 float sum2 = 0.f;
7095 float sum3 = 0.f;
7096
7097 for (int q = 0; q < inch; q++)
7098 {
7099 sum0 += bb2p0[0] * ktm0[0];
7100 sum1 += bb2p0[1] * ktm0[0];
7101 sum2 += bb2p0[2] * ktm0[0];
7102 sum3 += bb2p0[3] * ktm0[0];
7103
7104 bb2p0 += 4;
7105 ktm0 += 1;
7106 }
7107
7108 output0_tm[0] = sum0;
7109 output0_tm[1] = sum1;
7110 output0_tm[2] = sum2;
7111 output0_tm[3] = sum3;
7112
7113 output0_tm += 4;
7114 #endif // __ARM_NEON
7115 }
7116 for (; i < tiles; i++)
7117 {
7118 const float* bb2p0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
7119
7120 const float* ktm0 = kernel_tm0.row(r);
7121
7122 int q = 0;
7123 #if __ARM_NEON
7124 float32x4_t _sum0 = vdupq_n_f32(0.f);
7125 for (; q + 3 < inch; q += 4)
7126 {
7127 // asm volatile("prfm pldl1keep, [%0, #128] \n" : :"r"(bb2p0) :);
7128 float32x4_t _bb2p0 = vld1q_f32(bb2p0);
7129 bb2p0 += 4;
7130
7131 float32x4_t _ktm0 = vld1q_f32(ktm0);
7132 ktm0 += 4;
7133
7134 _sum0 = vmlaq_f32(_sum0, _bb2p0, _ktm0);
7135 }
7136
7137 #if __aarch64__
7138 float sum0 = vaddvq_f32(_sum0);
7139 #else
7140 float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
7141 float sum0 = vget_lane_f32(vpadd_f32(_ss0, _ss0), 0);
7142 #endif // __aarch64__
7143 #else
7144 float sum0 = 0.f;
7145 #endif
7146 for (; q < inch; q++)
7147 {
7148 sum0 += bb2p0[0] * ktm0[0];
7149
7150 bb2p0 += 1;
7151 ktm0 += 1;
7152 }
7153
7154 output0_tm[0] = sum0;
7155
7156 output0_tm += 1;
7157 }
7158 }
7159 }
7160 }
7161 bottom_blob_tm = Mat();
7162 // END dot
7163
7164 // BEGIN transform output
7165 Mat top_blob_bordered;
7166 if (outw == top_blob.w && outh == top_blob.h)
7167 {
7168 top_blob_bordered = top_blob;
7169 }
7170 else
7171 {
7172 top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
7173 }
7174 {
7175 // const float otm[6][8] = {
7176 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
7177 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
7178 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
7179 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
7180 // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
7181 // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
7182 // };
7183
7184 // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
7185 // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
7186 // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
7187 // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
7188 // 4 = (r1 + r2) + (r3 + r4) * 16 + (r5 + r6) * 2
7189 // 5 = r7 + (r1 - r2) + (r3 - r4) * 32 + (r5 - r6)
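// i.e. Y = A^T * M * A for each 8x8 tile M, with A^T the 6x8 matrix
// commented above: the six expressions are one application of A^T,
// performed once along tile rows into tmp[6][8] and once along columns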
7190
7191 #if __ARM_NEON
7192 const float coeff[4] = {4.f, 8.f, 16.f, 32.f};
7193 float32x4_t _coeff = vld1q_f32(coeff);
7194 #endif // __ARM_NEON
7195
7196 int w_tm = outw / 6 * 8;
7197 int h_tm = outh / 6 * 8;
7198 const int tiles = w_tm / 8 * h_tm / 8;
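// w_tm/h_tm: padded output size in 8-sample transform units;
// tiles = number of 8x8 transform tiles per channel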
7199
7200 #pragma omp parallel for num_threads(opt.num_threads)
7201 for (int p = 0; p < outch; p++)
7202 {
7203 const Mat out0_tm = top_blob_tm.channel(p);
7204 Mat out0 = top_blob_bordered.channel(p);
7205
7206 const float bias0 = bias ? bias[p] : 0.f;
7207 #if __ARM_NEON
7208 float32x2_t _bias0 = vdup_n_f32(bias0);
7209 #endif // __ARM_NEON
7210
7211 float tmp[6][8];
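// tmp receives the first (row) pass of the output transform; the second
// pass below reduces it to the final 6x6 block and adds the bias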
7212
7213 // tile
7214 for (int i = 0; i < outh / 6; i++)
7215 {
7216 for (int j = 0; j < outw / 6; j++)
7217 {
7218 #if __ARM_NEON
7219 #if __aarch64__
7220 const float* output0_tm0 = out0_tm.row(i * w_tm / 8 + j);
7221 const float* output0_tm1 = out0_tm.row(i * w_tm / 8 + j + tiles * 8);
7222 const float* output0_tm2 = out0_tm.row(i * w_tm / 8 + j + tiles * 16);
7223 const float* output0_tm3 = out0_tm.row(i * w_tm / 8 + j + tiles * 24);
7224
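// gather the 8x8 tile four transform rows at a time: each vector below
// collects one column of rows m..m+3, stepping one transform position
// (out0_tm.w * tiles floats) between lane loads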
7225 for (int m = 0; m + 3 < 8; m += 4)
7226 {
7227 float32x4_t _output0_tm_00 = {};
7228 float32x4_t _output0_tm_11 = {};
7229 float32x4_t _output0_tm_22 = {};
7230 float32x4_t _output0_tm_33 = {};
7231 float32x4_t _output0_tm_44 = {};
7232 float32x4_t _output0_tm_55 = {};
7233 float32x4_t _output0_tm_66 = {};
7234 float32x4_t _output0_tm_77 = {};
7235
7236 _output0_tm_00 = vsetq_lane_f32(output0_tm0[0], _output0_tm_00, 0);
7237 output0_tm0 += out0_tm.w * tiles;
7238 _output0_tm_00 = vsetq_lane_f32(output0_tm1[0], _output0_tm_00, 1);
7239 output0_tm1 += out0_tm.w * tiles;
7240 _output0_tm_00 = vsetq_lane_f32(output0_tm2[0], _output0_tm_00, 2);
7241 output0_tm2 += out0_tm.w * tiles;
7242 _output0_tm_00 = vsetq_lane_f32(output0_tm3[0], _output0_tm_00, 3);
7243 output0_tm3 += out0_tm.w * tiles;
7244
7245 _output0_tm_11 = vsetq_lane_f32(output0_tm0[0], _output0_tm_11, 0);
7246 output0_tm0 += out0_tm.w * tiles;
7247 _output0_tm_11 = vsetq_lane_f32(output0_tm1[0], _output0_tm_11, 1);
7248 output0_tm1 += out0_tm.w * tiles;
7249 _output0_tm_11 = vsetq_lane_f32(output0_tm2[0], _output0_tm_11, 2);
7250 output0_tm2 += out0_tm.w * tiles;
7251 _output0_tm_11 = vsetq_lane_f32(output0_tm3[0], _output0_tm_11, 3);
7252 output0_tm3 += out0_tm.w * tiles;
7253
7254 _output0_tm_22 = vsetq_lane_f32(output0_tm0[0], _output0_tm_22, 0);
7255 output0_tm0 += out0_tm.w * tiles;
7256 _output0_tm_22 = vsetq_lane_f32(output0_tm1[0], _output0_tm_22, 1);
7257 output0_tm1 += out0_tm.w * tiles;
7258 _output0_tm_22 = vsetq_lane_f32(output0_tm2[0], _output0_tm_22, 2);
7259 output0_tm2 += out0_tm.w * tiles;
7260 _output0_tm_22 = vsetq_lane_f32(output0_tm3[0], _output0_tm_22, 3);
7261 output0_tm3 += out0_tm.w * tiles;
7262
7263 _output0_tm_33 = vsetq_lane_f32(output0_tm0[0], _output0_tm_33, 0);
7264 output0_tm0 += out0_tm.w * tiles;
7265 _output0_tm_33 = vsetq_lane_f32(output0_tm1[0], _output0_tm_33, 1);
7266 output0_tm1 += out0_tm.w * tiles;
7267 _output0_tm_33 = vsetq_lane_f32(output0_tm2[0], _output0_tm_33, 2);
7268 output0_tm2 += out0_tm.w * tiles;
7269 _output0_tm_33 = vsetq_lane_f32(output0_tm3[0], _output0_tm_33, 3);
7270 output0_tm3 += out0_tm.w * tiles;
7271
7272 _output0_tm_44 = vsetq_lane_f32(output0_tm0[0], _output0_tm_44, 0);
7273 output0_tm0 += out0_tm.w * tiles;
7274 _output0_tm_44 = vsetq_lane_f32(output0_tm1[0], _output0_tm_44, 1);
7275 output0_tm1 += out0_tm.w * tiles;
7276 _output0_tm_44 = vsetq_lane_f32(output0_tm2[0], _output0_tm_44, 2);
7277 output0_tm2 += out0_tm.w * tiles;
7278 _output0_tm_44 = vsetq_lane_f32(output0_tm3[0], _output0_tm_44, 3);
7279 output0_tm3 += out0_tm.w * tiles;
7280
7281 _output0_tm_55 = vsetq_lane_f32(output0_tm0[0], _output0_tm_55, 0);
7282 output0_tm0 += out0_tm.w * tiles;
7283 _output0_tm_55 = vsetq_lane_f32(output0_tm1[0], _output0_tm_55, 1);
7284 output0_tm1 += out0_tm.w * tiles;
7285 _output0_tm_55 = vsetq_lane_f32(output0_tm2[0], _output0_tm_55, 2);
7286 output0_tm2 += out0_tm.w * tiles;
7287 _output0_tm_55 = vsetq_lane_f32(output0_tm3[0], _output0_tm_55, 3);
7288 output0_tm3 += out0_tm.w * tiles;
7289
7290 _output0_tm_66 = vsetq_lane_f32(output0_tm0[0], _output0_tm_66, 0);
7291 output0_tm0 += out0_tm.w * tiles;
7292 _output0_tm_66 = vsetq_lane_f32(output0_tm1[0], _output0_tm_66, 1);
7293 output0_tm1 += out0_tm.w * tiles;
7294 _output0_tm_66 = vsetq_lane_f32(output0_tm2[0], _output0_tm_66, 2);
7295 output0_tm2 += out0_tm.w * tiles;
7296 _output0_tm_66 = vsetq_lane_f32(output0_tm3[0], _output0_tm_66, 3);
7297 output0_tm3 += out0_tm.w * tiles;
7298
7299 _output0_tm_77 = vsetq_lane_f32(output0_tm0[0], _output0_tm_77, 0);
7300 _output0_tm_77 = vsetq_lane_f32(output0_tm1[0], _output0_tm_77, 1);
7301 _output0_tm_77 = vsetq_lane_f32(output0_tm2[0], _output0_tm_77, 2);
7302 _output0_tm_77 = vsetq_lane_f32(output0_tm3[0], _output0_tm_77, 3);
7303
7304 float32x4_t _tmp024a = vaddq_f32(_output0_tm_11, _output0_tm_22);
7305 float32x4_t _tmp135a = vsubq_f32(_output0_tm_11, _output0_tm_22);
7306
7307 float32x4_t _tmp024b = vaddq_f32(_output0_tm_33, _output0_tm_44);
7308 float32x4_t _tmp135b = vsubq_f32(_output0_tm_33, _output0_tm_44);
7309
7310 float32x4_t _tmp024c = vaddq_f32(_output0_tm_55, _output0_tm_66);
7311 float32x4_t _tmp135c = vsubq_f32(_output0_tm_55, _output0_tm_66);
7312
7313 float32x4_t _tmp0 = vaddq_f32(_output0_tm_00, _tmp024a);
7314 _tmp0 = vmlaq_lane_f32(_tmp0, _tmp024c, vget_high_f32(_coeff), 1);
7315 _tmp0 = vaddq_f32(_tmp0, _tmp024b);
7316
7317 float32x4_t _tmp2 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
7318 _tmp2 = vmlaq_lane_f32(_tmp2, _tmp024c, vget_low_f32(_coeff), 1);
7319
7320 float32x4_t _tmp4 = vmlaq_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
7321 _tmp4 = vaddq_f32(_tmp4, _tmp024c);
7322 _tmp4 = vaddq_f32(_tmp4, _tmp024c);
7323
7324 vst1q_f32(&tmp[0][m], _tmp0);
7325 vst1q_f32(&tmp[2][m], _tmp2);
7326 vst1q_f32(&tmp[4][m], _tmp4);
7327
7328 float32x4_t _tmp1 = vmlaq_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
7329 _tmp1 = vaddq_f32(_tmp1, _tmp135b);
7330 _tmp1 = vaddq_f32(_tmp1, _tmp135b);
7331
7332 float32x4_t _tmp3 = vmlaq_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
7333 _tmp3 = vmlaq_lane_f32(_tmp3, _tmp135c, vget_low_f32(_coeff), 0);
7334
7335 float32x4_t _tmp5 = vaddq_f32(_output0_tm_77, _tmp135a);
7336 _tmp5 = vmlaq_lane_f32(_tmp5, _tmp135b, vget_high_f32(_coeff), 1);
7337 _tmp5 = vaddq_f32(_tmp5, _tmp135c);
7338
7339 vst1q_f32(&tmp[1][m], _tmp1);
7340 vst1q_f32(&tmp[3][m], _tmp3);
7341 vst1q_f32(&tmp[5][m], _tmp5);
7342
7343 output0_tm0 += out0_tm.w * tiles * 25;
7344 output0_tm1 += out0_tm.w * tiles * 25;
7345 output0_tm2 += out0_tm.w * tiles * 25;
7346 output0_tm3 += out0_tm.w * tiles * 25;
7347 }
7348
7349 const float* t0 = tmp[0];
7350 const float* t1 = tmp[1];
7351
7352 float* output0 = out0.row(i * 6) + j * 6;
7353 float* output1 = output0 + outw;
7354
7355 for (int m = 0; m + 1 < 6; m += 2)
7356 {
7357 float32x4_t _t0_0123 = vld1q_f32(t0);
7358 float32x4_t _t0_4567 = vld1q_f32(t0 + 4);
7359 float32x4_t _t1_0123 = vld1q_f32(t1);
7360 float32x4_t _t1_4567 = vld1q_f32(t1 + 4);
7361
7362 float32x4x2_t _t01_00221133 = vtrnq_f32(_t0_0123, _t1_0123);
7363 float32x4x2_t _t01_44665577 = vtrnq_f32(_t0_4567, _t1_4567);
7364
7365 float32x2_t _t_00 = vget_low_f32(_t01_00221133.val[0]);
7366 float32x2_t _t_11 = vget_low_f32(_t01_00221133.val[1]);
7367 float32x2_t _t_22 = vget_high_f32(_t01_00221133.val[0]);
7368 float32x2_t _t_33 = vget_high_f32(_t01_00221133.val[1]);
7369 float32x2_t _t_44 = vget_low_f32(_t01_44665577.val[0]);
7370 float32x2_t _t_55 = vget_low_f32(_t01_44665577.val[1]);
7371 float32x2_t _t_66 = vget_high_f32(_t01_44665577.val[0]);
7372 float32x2_t _t_77 = vget_high_f32(_t01_44665577.val[1]);
7373
7374 float32x2_t _tmp024a = vadd_f32(_t_11, _t_22);
7375 float32x2_t _tmp135a = vsub_f32(_t_11, _t_22);
7376
7377 float32x2_t _tmp024b = vadd_f32(_t_33, _t_44);
7378 float32x2_t _tmp135b = vsub_f32(_t_33, _t_44);
7379
7380 float32x2_t _tmp024c = vadd_f32(_t_55, _t_66);
7381 float32x2_t _tmp135c = vsub_f32(_t_55, _t_66);
7382
7383 float32x2_t _output_0 = vadd_f32(_t_00, _tmp024a);
7384 _output_0 = vmla_lane_f32(_output_0, _tmp024c, vget_high_f32(_coeff), 1);
7385 _output_0 = vadd_f32(_output_0, _tmp024b);
7386 _output_0 = vadd_f32(_output_0, _bias0);
7387
7388 float32x2_t _output_2 = vmla_lane_f32(_tmp024a, _tmp024b, vget_low_f32(_coeff), 0);
7389 _output_2 = vmla_lane_f32(_output_2, _tmp024c, vget_low_f32(_coeff), 1);
7390 _output_2 = vadd_f32(_output_2, _bias0);
7391
7392 float32x2_t _output_4 = vmla_lane_f32(_tmp024a, _tmp024b, vget_high_f32(_coeff), 0);
7393 _output_4 = vadd_f32(_output_4, _tmp024c);
7394 _output_4 = vadd_f32(_output_4, _tmp024c);
7395 _output_4 = vadd_f32(_output_4, _bias0);
7396
7397 output0[0] = vget_lane_f32(_output_0, 0);
7398 output1[0] = vget_lane_f32(_output_0, 1);
7399 output0[2] = vget_lane_f32(_output_2, 0);
7400 output1[2] = vget_lane_f32(_output_2, 1);
7401 output0[4] = vget_lane_f32(_output_4, 0);
7402 output1[4] = vget_lane_f32(_output_4, 1);
7403
7404 float32x2_t _output_1 = vmla_lane_f32(_tmp135a, _tmp135c, vget_high_f32(_coeff), 0);
7405 _output_1 = vadd_f32(_output_1, _tmp135b);
7406 _output_1 = vadd_f32(_output_1, _tmp135b);
7407 _output_1 = vadd_f32(_output_1, _bias0);
7408
7409 float32x2_t _output_3 = vmla_lane_f32(_tmp135a, _tmp135b, vget_low_f32(_coeff), 1);
7410 _output_3 = vmla_lane_f32(_output_3, _tmp135c, vget_low_f32(_coeff), 0);
7411 _output_3 = vadd_f32(_output_3, _bias0);
7412
7413 float32x2_t _output_5 = vadd_f32(_t_77, _tmp135a);
7414 _output_5 = vmla_lane_f32(_output_5, _tmp135b, vget_high_f32(_coeff), 1);
7415 _output_5 = vadd_f32(_output_5, _tmp135c);
7416 _output_5 = vadd_f32(_output_5, _bias0);
7417
7418 output0[1] = vget_lane_f32(_output_1, 0);
7419 output1[1] = vget_lane_f32(_output_1, 1);
7420 output0[3] = vget_lane_f32(_output_3, 0);
7421 output1[3] = vget_lane_f32(_output_3, 1);
7422 output0[5] = vget_lane_f32(_output_5, 0);
7423 output1[5] = vget_lane_f32(_output_5, 1);
7424
7425 t0 += 8 * 2;
7426 t1 += 8 * 2;
7427 output0 += outw * 2;
7428 output1 += outw * 2;
7429 }
7430 #else // __aarch64__
7431 const float* output0_tm0_0 = out0_tm.row(i * w_tm / 8 + j);
7432 const float* output0_tm1_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 8);
7433 const float* output0_tm2_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 16);
7434 const float* output0_tm3_0 = out0_tm.row(i * w_tm / 8 + j + tiles * 24);
7435 const float* output0_tm0_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 32);
7436 const float* output0_tm1_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 40);
7437 const float* output0_tm2_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 48);
7438 const float* output0_tm3_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 56);
7439
7440 float* t0 = tmp[0];
7441 float* t1 = tmp[1];
7442
7443 // int step = out0_tm.w * tiles * 2*4 *4;
7444 int step = out0_tm.w * tiles * 4;
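// step: byte distance between two consecutive transform positions of
// this tile (out0_tm.w * tiles floats), used as the post-increment %21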
7445
7446 asm volatile(
7447
7448 // loop0
7449 // "vld1.f32 {d16-d17}, [%2], %21 \n"
7450 // "vld1.f32 {d18-d19}, [%3], %21 \n"
7451 // "vld1.f32 {d20-d21}, [%4], %21 \n"
7452 // "vld1.f32 {d22-d23}, [%5], %21 \n"
7453 // "vld1.f32 {d24-d25}, [%6], %21 \n"
7454 // "vld1.f32 {d26-d27}, [%7], %21 \n"
7455 // "vld1.f32 {d28-d29}, [%8], %21 \n"
7456 // "vld1.f32 {d30-d31}, [%9], %21 \n"
7457
7458 // "vtrn.32 q8, q10 \n"
7459 // "vtrn.32 q9, q11 \n"
7460 // "vtrn.32 q12, q14 \n"
7461 // "vtrn.32 q13, q15 \n"
7462
7463 // "vswp d17, d24 \n"
7464 // "vswp d19, d26 \n"
7465 // "vswp d21, d28 \n"// q8 = 00 q9 = 44 q10 = 11 q11 = 55
7466 // "vswp d23, d30 \n"// q12 = 22 q13 = 66 q14 = 33 q15 = 77
7467 "vld1.f32 {d16[0]}, [%2], %21 \n"
7468 "vld1.f32 {d16[1]}, [%3], %21 \n"
7469 "vld1.f32 {d17[0]}, [%4], %21 \n"
7470 "vld1.f32 {d17[1]}, [%5], %21 \n"
7471
7472 "vld1.f32 {d20[0]}, [%2], %21 \n"
7473 "vld1.f32 {d20[1]}, [%3], %21 \n"
7474 "vld1.f32 {d21[0]}, [%4], %21 \n"
7475 "vld1.f32 {d21[1]}, [%5], %21 \n"
7476
7477 "vld1.f32 {d24[0]}, [%2], %21 \n"
7478 "vld1.f32 {d24[1]}, [%3], %21 \n"
7479 "vld1.f32 {d25[0]}, [%4], %21 \n"
7480 "vld1.f32 {d25[1]}, [%5], %21 \n"
7481
7482 "vadd.f32 q2, q10, q12 \n"
7483 "vsub.f32 q3, q10, q12 \n"
7484
7485 "vld1.f32 {d28[0]}, [%2], %21 \n"
7486 "vld1.f32 {d28[1]}, [%3], %21 \n"
7487 "vld1.f32 {d29[0]}, [%4], %21 \n"
7488 "vld1.f32 {d29[1]}, [%5], %21 \n"
7489
7490 "vld1.f32 {d18[0]}, [%2], %21 \n"
7491 "vld1.f32 {d18[1]}, [%3], %21 \n"
7492 "vld1.f32 {d19[0]}, [%4], %21 \n"
7493 "vld1.f32 {d19[1]}, [%5], %21 \n"
7494
7495 "vadd.f32 q4, q14, q9 \n"
7496 "vsub.f32 q5, q14, q9 \n"
7497
7498 "vld1.f32 {d22[0]}, [%2], %21 \n"
7499 "vld1.f32 {d22[1]}, [%3], %21 \n"
7500 "vld1.f32 {d23[0]}, [%4], %21 \n"
7501 "vld1.f32 {d23[1]}, [%5], %21 \n"
7502
7503 "vld1.f32 {d26[0]}, [%2], %21 \n"
7504 "vld1.f32 {d26[1]}, [%3], %21 \n"
7505 "vld1.f32 {d27[0]}, [%4], %21 \n"
7506 "vld1.f32 {d27[1]}, [%5], %21 \n"
7507
7508 "vadd.f32 q6, q11, q13 \n"
7509 "vsub.f32 q7, q11, q13 \n" // spare q9 q10 q11 q12 q13 q14
7510
7511 "vld1.f32 {d30[0]}, [%2] \n"
7512 "vld1.f32 {d30[1]}, [%3] \n"
7513 "vld1.f32 {d31[0]}, [%4] \n"
7514 "vld1.f32 {d31[1]}, [%5] \n"
7515
7516 "vmov q9, q3 \n"
7517 "vadd.f32 q8, q8, q2 \n"
7518 "vmla.f32 q9, q7, %f20[0] \n"
7519 "vmov q12, q2 \n"
7520 "vmov q10, q2 \n"
7521 "vmov q11, q3 \n"
7522 "vmla.f32 q12, q4, %f20[0] \n"
7523 "vadd.f32 q15, q15, q3 \n"
7524 "vmla.f32 q8, q6, %f20[1] \n"
7525 "vadd.f32 q9, q9, q5 \n"
7526 "vmla.f32 q10, q4, %e20[0] \n"
7527 "vmla.f32 q11, q5, %e20[1] \n"
7528 "vadd.f32 q12, q12, q6 \n"
7529 "vmla.f32 q15, q5, %f20[1] \n"
7530 "vadd.f32 q8, q8, q4 \n"
7531 "vadd.f32 q9, q9, q5 \n"
7532 "vmla.f32 q10, q6, %e20[1] \n"
7533 "vmla.f32 q11, q7, %e20[0] \n"
7534 "vadd.f32 q12, q12, q6 \n"
7535 "vadd.f32 q15, q15, q7 \n"
7536
7537 "vst1.f32 {d16-d17}, [%0] \n"
7538 "add %0, %0, #64 \n"
7539
7540 "vst1.f32 {d18-d19}, [%1] \n"
7541 "add %1, %1, #64 \n"
7542
7543 "vst1.f32 {d20-d21}, [%0] \n"
7544 "add %0, %0, #64 \n"
7545
7546 "vst1.f32 {d22-d23}, [%1] \n"
7547 "add %1, %1, #64 \n"
7548
7549 "vst1.f32 {d24-d25}, [%0] \n"
7550 "sub %0, %0, #112 \n"
7551
7552 "vst1.f32 {d30-d31}, [%1] \n"
7553 "sub %1, %1, #112 \n"
7554
7555 // loop1
7556 // "vld1.f32 {d16-d17}, [%2] \n"
7557 // "vld1.f32 {d18-d19}, [%3] \n"
7558 // "vld1.f32 {d20-d21}, [%4] \n"
7559 // "vld1.f32 {d22-d23}, [%5] \n"
7560 // "vld1.f32 {d24-d25}, [%6] \n"
7561 // "vld1.f32 {d26-d27}, [%7] \n"
7562 // "vld1.f32 {d28-d29}, [%8] \n"
7563 // "vld1.f32 {d30-d31}, [%9] \n"
7564
7565 // "vtrn.32 q8, q10 \n"
7566 // "vtrn.32 q9, q11 \n"
7567 // "vtrn.32 q12, q14 \n"
7568 // "vtrn.32 q13, q15 \n"
7569
7570 // "vswp d17, d24 \n"
7571 // "vswp d19, d26 \n"
7572 // "vswp d21, d28 \n"// q8 = 00 q9 = 44 q10 = 11 q11 = 55
7573 // "vswp d23, d30 \n"// q12 = 22 q13 = 66 q14 = 33 q15 = 77
7574 "vld1.f32 {d16[0]}, [%6], %21 \n"
7575 "vld1.f32 {d16[1]}, [%7], %21 \n"
7576 "vld1.f32 {d17[0]}, [%8], %21 \n"
7577 "vld1.f32 {d17[1]}, [%9], %21 \n"
7578
7579 "vld1.f32 {d20[0]}, [%6], %21 \n"
7580 "vld1.f32 {d20[1]}, [%7], %21 \n"
7581 "vld1.f32 {d21[0]}, [%8], %21 \n"
7582 "vld1.f32 {d21[1]}, [%9], %21 \n"
7583
7584 "vld1.f32 {d24[0]}, [%6], %21 \n"
7585 "vld1.f32 {d24[1]}, [%7], %21 \n"
7586 "vld1.f32 {d25[0]}, [%8], %21 \n"
7587 "vld1.f32 {d25[1]}, [%9], %21 \n"
7588
7589 "vadd.f32 q2, q10, q12 \n"
7590 "vsub.f32 q3, q10, q12 \n"
7591
7592 "vld1.f32 {d28[0]}, [%6], %21 \n"
7593 "vld1.f32 {d28[1]}, [%7], %21 \n"
7594 "vld1.f32 {d29[0]}, [%8], %21 \n"
7595 "vld1.f32 {d29[1]}, [%9], %21 \n"
7596
7597 "vld1.f32 {d18[0]}, [%6], %21 \n"
7598 "vld1.f32 {d18[1]}, [%7], %21 \n"
7599 "vld1.f32 {d19[0]}, [%8], %21 \n"
7600 "vld1.f32 {d19[1]}, [%9], %21 \n"
7601
7602 "vadd.f32 q4, q14, q9 \n"
7603 "vsub.f32 q5, q14, q9 \n"
7604
7605 "vld1.f32 {d22[0]}, [%6], %21 \n"
7606 "vld1.f32 {d22[1]}, [%7], %21 \n"
7607 "vld1.f32 {d23[0]}, [%8], %21 \n"
7608 "vld1.f32 {d23[1]}, [%9], %21 \n"
7609
7610 "vld1.f32 {d26[0]}, [%6], %21 \n"
7611 "vld1.f32 {d26[1]}, [%7], %21 \n"
7612 "vld1.f32 {d27[0]}, [%8], %21 \n"
7613 "vld1.f32 {d27[1]}, [%9], %21 \n"
7614
7615 "vadd.f32 q6, q11, q13 \n"
7616 "vsub.f32 q7, q11, q13 \n" // spare q9 q10 q11 q12 q13 q14
7617
7618 "vld1.f32 {d30[0]}, [%6] \n"
7619 "vld1.f32 {d30[1]}, [%7] \n"
7620 "vld1.f32 {d31[0]}, [%8] \n"
7621 "vld1.f32 {d31[1]}, [%9] \n"
7622
7623 "vmov q9, q3 \n"
7624 "vadd.f32 q8, q8, q2 \n"
7625 "vmla.f32 q9, q7, %f20[0] \n"
7626 "vmov q12, q2 \n"
7627 "vmov q10, q2 \n"
7628 "vmov q11, q3 \n"
7629 "vmla.f32 q12, q4, %f20[0] \n"
7630 "vadd.f32 q15, q15, q3 \n"
7631 "vmla.f32 q8, q6, %f20[1] \n"
7632 "vadd.f32 q9, q9, q5 \n"
7633 "vmla.f32 q10, q4, %e20[0] \n"
7634 "vmla.f32 q11, q5, %e20[1] \n"
7635 "vadd.f32 q12, q12, q6 \n"
7636 "vmla.f32 q15, q5, %f20[1] \n"
7637 "vadd.f32 q8, q8, q4 \n"
7638 "vadd.f32 q9, q9, q5 \n"
7639 "vmla.f32 q10, q6, %e20[1] \n"
7640 "vmla.f32 q11, q7, %e20[0] \n"
7641 "vadd.f32 q12, q12, q6 \n"
7642 "vadd.f32 q15, q15, q7 \n"
7643
7644 "vst1.f32 {d16-d17}, [%0] \n"
7645 "add %0, %0, #64 \n"
7646
7647 "vst1.f32 {d18-d19}, [%1] \n"
7648 "add %1, %1, #64 \n"
7649
7650 "vst1.f32 {d20-d21}, [%0] \n"
7651 "add %0, %0, #64 \n"
7652
7653 "vst1.f32 {d22-d23}, [%1] \n"
7654 "add %1, %1, #64 \n"
7655
7656 "vst1.f32 {d24-d25}, [%0] \n"
7657
7658 "vst1.f32 {d30-d31}, [%1] \n"
7659
7660 : "=r"(t0), // %0
7661 "=r"(t1), // %1
7662 "=r"(output0_tm0_0), // %2
7663 "=r"(output0_tm1_0), // %3
7664 "=r"(output0_tm2_0), // %4
7665 "=r"(output0_tm3_0), // %5
7666 "=r"(output0_tm0_4), // %6
7667 "=r"(output0_tm1_4), // %7
7668 "=r"(output0_tm2_4), // %8
7669 "=r"(output0_tm3_4) // %9
7670 : "0"(t0),
7671 "1"(t1),
7672 "2"(output0_tm0_0),
7673 "3"(output0_tm1_0),
7674 "4"(output0_tm2_0),
7675 "5"(output0_tm3_0),
7676 "6"(output0_tm0_4),
7677 "7"(output0_tm1_4),
7678 "8"(output0_tm2_4),
7679 "9"(output0_tm3_4),
7680 "w"(_coeff), // %20
7681 "r"(step) // %21
7682 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
7683
7684 t0 = tmp[0];
7685 t1 = tmp[1];
7686
7687 float* output0 = out0.row(i * 6) + j * 6;
7688 float* output1 = output0 + outw;
7689
7690 int stepw = outw * 2 * 4;
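// stepw: byte distance of two output rows, since each asm pass writes
// one row through output0 and the following row through output1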
7691
7692 asm volatile(
7693
7694 // loop0
7695 "vld1.f32 {d16-d19}, [%2] \n"
7696 "vld1.f32 {d20-d23}, [%3] \n"
7697
7698 "add %2, %2, #64 \n"
7699 "add %3, %3, #64 \n"
7700
7701 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
7702 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
7703
7704 "vadd.f32 d4, d20, d17 \n"
7705 "vsub.f32 d5, d20, d17 \n"
7706
7707 "vadd.f32 d6, d21, d18 \n"
7708 "vsub.f32 d7, d21, d18 \n"
7709
7710 "vadd.f32 d8, d22, d19 \n"
7711 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
7712
7713 "vmov d20, d5 \n"
7714 "vmov d18, d4 \n"
7715
7716 "vadd.f32 d16, d16, d4 \n"
7717 "vmla.f32 d20, d9, %f8[0] \n"
7718 "vmov d17, d4 \n"
7719 "vmov d21, d5 \n"
7720 "vmla.f32 d18, d6, %f8[0] \n"
7721 "vadd.f32 d22, d23, d5 \n"
7722
7723 "vmla.f32 d16, d8, %f8[1] \n"
7724 "vadd.f32 d20, d20, d7 \n"
7725 "vmla.f32 d17, d6, %e8[0] \n"
7726 "vmla.f32 d21, d7, %e8[1] \n"
7727 "vadd.f32 d18, d18, d8 \n"
7728 "vmla.f32 d22, d7, %f8[1] \n"
7729
7730 "vadd.f32 d16, d16, d6 \n"
7731 "vadd.f32 d20, d20, d7 \n"
7732 "vmla.f32 d17, d8, %e8[1] \n"
7733 "vmla.f32 d21, d9, %e8[0] \n"
7734 "vadd.f32 d18, d18, d8 \n"
7735 "vadd.f32 d22, d22, d9 \n"
7736
7737 "vadd.f32 d16, d16, %P9 \n" // _bias0
7738 "vadd.f32 d20, d20, %P9 \n" // _bias0
7739 "vadd.f32 d17, d17, %P9 \n" // _bias0
7740 "vadd.f32 d21, d21, %P9 \n" // _bias0
7741 "vadd.f32 d18, d18, %P9 \n" // _bias0
7742 "vadd.f32 d22, d22, %P9 \n" // _bias0
7743
7744 "vtrn.f32 q8, q10 \n"
7745 "vtrn.f32 d18, d22 \n"
7746
7747 "vst1.f32 {d16-d18}, [%0], %10 \n"
7748 "vst1.f32 {d20-d22}, [%1], %10 \n"
7749
7750 // loop1
7751 "vld1.f32 {d16-d19}, [%2] \n"
7752 "vld1.f32 {d20-d23}, [%3] \n"
7753
7754 "add %2, %2, #64 \n"
7755 "add %3, %3, #64 \n"
7756
7757 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
7758 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
7759
7760 "vadd.f32 d4, d20, d17 \n"
7761 "vsub.f32 d5, d20, d17 \n"
7762
7763 "vadd.f32 d6, d21, d18 \n"
7764 "vsub.f32 d7, d21, d18 \n"
7765
7766 "vadd.f32 d8, d22, d19 \n"
7767 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
7768
7769 "vmov d20, d5 \n"
7770 "vmov d18, d4 \n"
7771
7772 "vadd.f32 d16, d16, d4 \n"
7773 "vmla.f32 d20, d9, %f8[0] \n"
7774 "vmov d17, d4 \n"
7775 "vmov d21, d5 \n"
7776 "vmla.f32 d18, d6, %f8[0] \n"
7777 "vadd.f32 d22, d23, d5 \n"
7778
7779 "vmla.f32 d16, d8, %f8[1] \n"
7780 "vadd.f32 d20, d20, d7 \n"
7781 "vmla.f32 d17, d6, %e8[0] \n"
7782 "vmla.f32 d21, d7, %e8[1] \n"
7783 "vadd.f32 d18, d18, d8 \n"
7784 "vmla.f32 d22, d7, %f8[1] \n"
7785
7786 "vadd.f32 d16, d16, d6 \n"
7787 "vadd.f32 d20, d20, d7 \n"
7788 "vmla.f32 d17, d8, %e8[1] \n"
7789 "vmla.f32 d21, d9, %e8[0] \n"
7790 "vadd.f32 d18, d18, d8 \n"
7791 "vadd.f32 d22, d22, d9 \n"
7792
7793 "vadd.f32 d16, d16, %P9 \n" // _bias0
7794 "vadd.f32 d20, d20, %P9 \n" // _bias0
7795 "vadd.f32 d17, d17, %P9 \n" // _bias0
7796 "vadd.f32 d21, d21, %P9 \n" // _bias0
7797 "vadd.f32 d18, d18, %P9 \n" // _bias0
7798 "vadd.f32 d22, d22, %P9 \n" // _bias0
7799
7800 "vtrn.f32 q8, q10 \n"
7801 "vtrn.f32 d18, d22 \n"
7802
7803 "vst1.f32 {d16-d18}, [%0], %10 \n"
7804 "vst1.f32 {d20-d22}, [%1], %10 \n"
7805
7806 // loop2
7807 "vld1.f32 {d16-d19}, [%2] \n"
7808 "vld1.f32 {d20-d23}, [%3] \n"
7809
7810 "add %2, %2, #64 \n"
7811 "add %3, %3, #64 \n"
7812
7813 "vtrn.32 q8, q10 \n" // q8 = 0 2 q10 = 1 3
7814 "vtrn.32 q9, q11 \n" // q9 = 4 6 q11 = 5 7
7815
7816 "vadd.f32 d4, d20, d17 \n"
7817 "vsub.f32 d5, d20, d17 \n"
7818
7819 "vadd.f32 d6, d21, d18 \n"
7820 "vsub.f32 d7, d21, d18 \n"
7821
7822 "vadd.f32 d8, d22, d19 \n"
7823 "vsub.f32 d9, d22, d19 \n" // spare d17 ~ d22
7824
7825 "vmov d20, d5 \n"
7826 "vmov d18, d4 \n"
7827
7828 "vadd.f32 d16, d16, d4 \n"
7829 "vmla.f32 d20, d9, %f8[0] \n"
7830 "vmov d17, d4 \n"
7831 "vmov d21, d5 \n"
7832 "vmla.f32 d18, d6, %f8[0] \n"
7833 "vadd.f32 d22, d23, d5 \n"
7834
7835 "vmla.f32 d16, d8, %f8[1] \n"
7836 "vadd.f32 d20, d20, d7 \n"
7837 "vmla.f32 d17, d6, %e8[0] \n"
7838 "vmla.f32 d21, d7, %e8[1] \n"
7839 "vadd.f32 d18, d18, d8 \n"
7840 "vmla.f32 d22, d7, %f8[1] \n"
7841
7842 "vadd.f32 d16, d16, d6 \n"
7843 "vadd.f32 d20, d20, d7 \n"
7844 "vmla.f32 d17, d8, %e8[1] \n"
7845 "vmla.f32 d21, d9, %e8[0] \n"
7846 "vadd.f32 d18, d18, d8 \n"
7847 "vadd.f32 d22, d22, d9 \n"
7848
7849 "vadd.f32 d16, d16, %P9 \n" // _bias0
7850 "vadd.f32 d20, d20, %P9 \n" // _bias0
7851 "vadd.f32 d17, d17, %P9 \n" // _bias0
7852 "vadd.f32 d21, d21, %P9 \n" // _bias0
7853 "vadd.f32 d18, d18, %P9 \n" // _bias0
7854 "vadd.f32 d22, d22, %P9 \n" // _bias0
7855
7856 "vtrn.f32 q8, q10 \n"
7857 "vtrn.f32 d18, d22 \n"
7858
7859 "vst1.f32 {d16-d18}, [%0], %10 \n"
7860 "vst1.f32 {d20-d22}, [%1], %10 \n"
7861
7862 : "=r"(output0), // %0
7863 "=r"(output1), // %1
7864 "=r"(t0), // %2
7865 "=r"(t1) // %3
7866 : "0"(output0),
7867 "1"(output1),
7868 "2"(t0),
7869 "3"(t1),
7870 "w"(_coeff), // %8
7871 "w"(_bias0), // %9
7872 "r"(stepw) // %10
7873 : "memory", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
7874 #endif // __aarch64__
7875 #else
7876 const float* output0_tm_0 = out0_tm.row(i * w_tm / 8 + j);
7877 const float* output0_tm_1 = out0_tm.row(i * w_tm / 8 + j + tiles);
7878 const float* output0_tm_2 = out0_tm.row(i * w_tm / 8 + j + tiles * 2);
7879 const float* output0_tm_3 = out0_tm.row(i * w_tm / 8 + j + tiles * 3);
7880 const float* output0_tm_4 = out0_tm.row(i * w_tm / 8 + j + tiles * 4);
7881 const float* output0_tm_5 = out0_tm.row(i * w_tm / 8 + j + tiles * 5);
7882 const float* output0_tm_6 = out0_tm.row(i * w_tm / 8 + j + tiles * 6);
7883 const float* output0_tm_7 = out0_tm.row(i * w_tm / 8 + j + tiles * 7);
7884
7885 for (int m = 0; m < 8; m++)
7886 {
7887 float tmp024a = output0_tm_1[0] + output0_tm_2[0];
7888 float tmp135a = output0_tm_1[0] - output0_tm_2[0];
7889
7890 float tmp024b = output0_tm_3[0] + output0_tm_4[0];
7891 float tmp135b = output0_tm_3[0] - output0_tm_4[0];
7892
7893 float tmp024c = output0_tm_5[0] + output0_tm_6[0];
7894 float tmp135c = output0_tm_5[0] - output0_tm_6[0];
7895
7896 tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
7897 tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
7898 tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
7899
7900 tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
7901 tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
7902 tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
7903
7904 output0_tm_0 += out0_tm.w * tiles * 8;
7905 output0_tm_1 += out0_tm.w * tiles * 8;
7906 output0_tm_2 += out0_tm.w * tiles * 8;
7907 output0_tm_3 += out0_tm.w * tiles * 8;
7908 output0_tm_4 += out0_tm.w * tiles * 8;
7909 output0_tm_5 += out0_tm.w * tiles * 8;
7910 output0_tm_6 += out0_tm.w * tiles * 8;
7911 output0_tm_7 += out0_tm.w * tiles * 8;
7912 }
7913
7914 float* output0 = out0.row(i * 6) + j * 6;
7915
7916 for (int m = 0; m < 6; m++)
7917 {
7918 const float* tmp0 = tmp[m];
7919
7920 float tmp024a = tmp0[1] + tmp0[2];
7921 float tmp135a = tmp0[1] - tmp0[2];
7922
7923 float tmp024b = tmp0[3] + tmp0[4];
7924 float tmp135b = tmp0[3] - tmp0[4];
7925
7926 float tmp024c = tmp0[5] + tmp0[6];
7927 float tmp135c = tmp0[5] - tmp0[6];
7928
7929 output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
7930 output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
7931 output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c;
7932
7933 output0[1] = bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16;
7934 output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
7935 output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
7936
7937 output0 += outw;
7938 }
7939 #endif // __ARM_NEON
7940 }
7941 }
7942 }
7943 }
7944 // END transform output
7945
7946 // cut the padded result back to the requested output size
7947 if (top_blob_bordered.w != top_blob.w || top_blob_bordered.h != top_blob.h)
7948 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
7949 }
7950
7951 static void conv3x3s2_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
7952 {
7953 int w = bottom_blob.w;
7954 int inch = bottom_blob.c;
7955
7956 int outw = top_blob.w;
7957 int outh = top_blob.h;
7958 int outch = top_blob.c;
7959
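// with stride 2 each output row consumes two input rows: the row
// pointers advance by 2 * outw while computing a row, so tailstep skips
// the leftover columns plus one full input row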
7960 const int tailstep = w - 2 * outw + w;
7961
7962 const float* kernel = _kernel;
7963 const float* bias = _bias;
7964
7965 int nn_outch = outch >> 1;
7966 int remain_outch_start = nn_outch << 1;
7967
7968 #pragma omp parallel for num_threads(opt.num_threads)
7969 for (int pp = 0; pp < nn_outch; pp++)
7970 {
7971 int p = pp * 2;
7972
7973 Mat out0 = top_blob.channel(p);
7974 Mat out1 = top_blob.channel(p + 1);
7975
7976 const float bias0 = bias ? bias[p] : 0.f;
7977 const float bias1 = bias ? bias[p + 1] : 0.f;
7978
7979 out0.fill(bias0);
7980 out1.fill(bias1);
7981
7982 const float* k0 = kernel + p * inch * 9;
7983 const float* k1 = kernel + (p + 1) * inch * 9;
7984
7985 for (int q = 0; q < inch; q++)
7986 {
7987 float* outptr0 = out0;
7988 float* outptr1 = out1;
7989
7990 const float* img0 = bottom_blob.channel(q);
7991
7992 const float* r0 = img0;
7993 const float* r1 = img0 + w;
7994 const float* r2 = img0 + w * 2;
7995
7996 #if __ARM_NEON
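// each 3x3 kernel is loaded as three overlapping 4-lane vectors;
// only lanes 0..2 of every vector are used by the multiply-accumulates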
7997 float32x4_t _k00 = vld1q_f32(k0);
7998 float32x4_t _k03 = vld1q_f32(k0 + 3);
7999 float32x4_t _k06 = vld1q_f32(k0 + 6);
8000
8001 float32x4_t _k10 = vld1q_f32(k1);
8002 float32x4_t _k13 = vld1q_f32(k1 + 3);
8003 float32x4_t _k16 = vld1q_f32(k1 + 6);
8004 #endif // __ARM_NEON
8005
8006 int i = 0;
8007
8008 for (; i < outh; i++)
8009 {
8010 #if __ARM_NEON
8011 int nn = outw >> 2;
8012 int remain = outw & 3;
8013 #else
8014 int remain = outw;
8015 #endif // __ARM_NEON
8016
8017 #if __ARM_NEON
8018 #if __aarch64__
8019 if (nn > 0)
8020 {
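// ld2 de-interleaves the stride-2 row into even pixels (v8) and odd
// pixels (v9); ext with the next even vector (v10) produces the third
// tap, so one pass covers kernel columns 0/1/2 for four outputs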
8021 asm volatile(
8022 "prfm pldl1keep, [%3, #256] \n"
8023 "ld2 {v8.4s, v9.4s}, [%3], #32 \n" // v8 v9 = r0
8024
8025 "0: \n"
8026
8027 "prfm pldl1keep, [%1, #128] \n"
8028 "ld1 {v6.4s}, [%1] \n" // v6 = _sum0
8029
8030 "fmul v12.4s, v8.4s, %12.s[0] \n"
8031
8032 "prfm pldl1keep, [%2, #128] \n"
8033 "ld1 {v7.4s}, [%2] \n" // v7 = _sum1
8034
8035 "fmul v13.4s, v8.4s, %15.s[0] \n"
8036
8037 "prfm pldl1keep, [%3, #128] \n"
8038 "ld2 {v10.4s, v11.4s}, [%3] \n" // v10
8039
8040 "fmla v6.4s, v9.4s, %12.s[1] \n"
8041
8042 "ext v14.16b, v8.16b, v10.16b, #4\n"
8043
8044 "fmla v7.4s, v9.4s, %15.s[1] \n"
8045
8046 "prfm pldl1keep, [%4, #256] \n"
8047 "ld2 {v8.4s, v9.4s}, [%4], #32 \n" // r1
8048
8049 "fmla v12.4s, v14.4s, %12.s[2] \n"
8050 "fmla v13.4s, v14.4s, %15.s[2] \n"
8051
8052 "prfm pldl1keep, [%4, #128] \n"
8053 "ld2 {v10.4s, v11.4s}, [%4] \n"
8054
8055 "fmla v6.4s, v8.4s, %13.s[0] \n"
8056 "fmla v7.4s, v8.4s, %16.s[0] \n"
8057
8058 "ext v14.16b, v8.16b, v10.16b, #4\n"
8059
8060 "fmla v12.4s, v9.4s, %13.s[1] \n"
8061 "fmla v13.4s, v9.4s, %16.s[1] \n"
8062
8063 "prfm pldl1keep, [%5, #256] \n"
8064 "ld2 {v8.4s, v9.4s}, [%5], #32 \n" // r2
8065
8066 "fmla v6.4s, v14.4s, %13.s[2] \n"
8067 "fmla v7.4s, v14.4s, %16.s[2] \n"
8068
8069 "prfm pldl1keep, [%5, #128] \n"
8070 "ld2 {v10.4s, v11.4s}, [%5] \n"
8071
8072 "fmla v12.4s, v8.4s, %14.s[0] \n"
8073 "fmla v13.4s, v8.4s, %17.s[0] \n"
8074
8075 "ext v14.16b, v8.16b, v10.16b, #4\n"
8076
8077 "fmla v6.4s, v9.4s, %14.s[1] \n"
8078 "fmla v7.4s, v9.4s, %17.s[1] \n"
8079
8080 "fmla v12.4s, v14.4s, %14.s[2] \n"
8081 "fmla v13.4s, v14.4s, %17.s[2] \n"
8082
8083 "prfm pldl1keep, [%3, #256] \n"
8084 "ld2 {v8.4s, v9.4s}, [%3], #32 \n" // v8 v9 = r0
8085
8086 "fadd v6.4s, v6.4s, v12.4s \n"
8087 "fadd v7.4s, v7.4s, v13.4s \n"
8088
8089 "subs %w0, %w0, #1 \n"
8090
8091 "st1 {v6.4s}, [%1], #16 \n"
8092 "st1 {v7.4s}, [%2], #16 \n"
8093
8094 "bne 0b \n"
8095 "sub %3, %3, #32 \n"
8096
8097 : "=r"(nn), // %0
8098 "=r"(outptr0), // %1
8099 "=r"(outptr1), // %2
8100 "=r"(r0), // %3
8101 "=r"(r1), // %4
8102 "=r"(r2) // %5
8103 : "0"(nn),
8104 "1"(outptr0),
8105 "2"(outptr1),
8106 "3"(r0),
8107 "4"(r1),
8108 "5"(r2),
8109 "w"(_k00), // %12
8110 "w"(_k03), // %13
8111 "w"(_k06), // %14
8112 "w"(_k10), // %15
8113 "w"(_k13), // %16
8114 "w"(_k16) // %17
8115 : "cc", "memory", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8116 }
8117 #else
8118 if (nn > 0)
8119 {
8120 asm volatile(
8121 "pld [%3, #256] \n"
8122 "vld2.f32 {d16-d19}, [%3]! \n" // q8 q9 = r0
8123
8124 "0: \n"
8125
8126 "pld [%1, #128] \n"
8127 "vld1.f32 {d12-d13}, [%1] \n" // q6 = _sum0
8128
8129 "vmul.f32 q12, q8, %e12[0] \n"
8130
8131 "pld [%2, #128] \n"
8132 "vld1.f32 {d14-d15}, [%2] \n" // q7 = _sum1
8133
8134 "vmul.f32 q13, q8, %e15[0] \n"
8135
8136 "pld [%3, #128] \n"
8137 "vld2.f32 {d20-d21}, [%3] \n" // q10
8138
8139 "vmla.f32 q6, q9, %e12[1] \n"
8140
8141 "vext.32 q11, q8, q10, #1 \n"
8142
8143 "vmla.f32 q7, q9, %e15[1] \n"
8144
8145 "pld [%4, #256] \n"
8146 "vld2.f32 {d16-d19}, [%4]! \n" // r1
8147
8148 "vmla.f32 q12, q11, %f12[0] \n"
8149 "vmla.f32 q13, q11, %f15[0] \n"
8150
8151 "pld [%4, #128] \n"
8152 "vld2.f32 {d20-d21}, [%4] \n"
8153
8154 "vmla.f32 q6, q8, %e13[0] \n"
8155 "vmla.f32 q7, q8, %e16[0] \n"
8156
8157 "vext.32 q11, q8, q10, #1 \n"
8158
8159 "vmla.f32 q12, q9, %e13[1] \n"
8160 "vmla.f32 q13, q9, %e16[1] \n"
8161
8162 "pld [%5, #256] \n"
8163 "vld2.f32 {d16-d19}, [%5]! \n" // r2
8164
8165 "vmla.f32 q6, q11, %f13[0] \n"
8166 "vmla.f32 q7, q11, %f16[0] \n"
8167
8168 "pld [%5, #128] \n"
8169 "vld2.f32 {d20-d21}, [%5] \n"
8170
8171 "vmla.f32 q12, q8, %e14[0] \n"
8172 "vmla.f32 q13, q8, %e17[0] \n"
8173
8174 "vext.32 q11, q8, q10, #1 \n"
8175
8176 "vmla.f32 q6, q9, %e14[1] \n"
8177 "vmla.f32 q7, q9, %e17[1] \n"
8178
8179 "vmla.f32 q12, q11, %f14[0] \n"
8180 "vmla.f32 q13, q11, %f17[0] \n"
8181
8182 "pld [%3, #256] \n"
8183 "vld2.f32 {d16-d19}, [%3]! \n" // q8 q9 = r0
8184
8185 "vadd.f32 q6, q6, q12 \n"
8186 "vadd.f32 q7, q7, q13 \n"
8187
8188 "subs %0, #1 \n"
8189
8190 "vst1.f32 {d12-d13}, [%1]! \n"
8191 "vst1.f32 {d14-d15}, [%2]! \n"
8192
8193 "bne 0b \n"
8194 "sub %3, #32 \n"
8195
8196 : "=r"(nn), // %0
8197 "=r"(outptr0), // %1
8198 "=r"(outptr1), // %2
8199 "=r"(r0), // %3
8200 "=r"(r1), // %4
8201 "=r"(r2) // %5
8202 : "0"(nn),
8203 "1"(outptr0),
8204 "2"(outptr1),
8205 "3"(r0),
8206 "4"(r1),
8207 "5"(r2),
8208 "w"(_k00), // %12
8209 "w"(_k03), // %13
8210 "w"(_k06), // %14
8211 "w"(_k10), // %15
8212 "w"(_k13), // %16
8213 "w"(_k16) // %17
8214 : "cc", "memory", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
8215 }
8216 #endif // __aarch64__
8217 #endif // __ARM_NEON
8218 for (; remain > 0; remain--)
8219 {
8220 #if __ARM_NEON
8221 float32x4_t _r00 = vld1q_f32(r0);
8222 float32x4_t _r10 = vld1q_f32(r1);
8223 float32x4_t _r20 = vld1q_f32(r2);
8224
8225 float32x4_t _sum0 = vmulq_f32(_r00, _k00);
8226 float32x4_t _sum1 = vmulq_f32(_r00, _k10);
8227 _sum0 = vmlaq_f32(_sum0, _r10, _k03);
8228 _sum1 = vmlaq_f32(_sum1, _r10, _k13);
8229 _sum0 = vmlaq_f32(_sum0, _r20, _k06);
8230 _sum1 = vmlaq_f32(_sum1, _r20, _k16);
8231
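// lane 3 of each sum holds garbage (the 4-float loads run one element past
// each 3-tap kernel row); overwrite it with the accumulated output so the
// horizontal add below yields out + dot(r, k) in one step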
8232 _sum0 = vsetq_lane_f32(*outptr0, _sum0, 3);
8233 _sum1 = vsetq_lane_f32(*outptr1, _sum1, 3);
8234 #if __aarch64__
8235 *outptr0 = vaddvq_f32(_sum0);
8236 *outptr1 = vaddvq_f32(_sum1);
8237 #else
8238 float32x2_t _ss0 = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
8239 float32x2_t _ss1 = vadd_f32(vget_low_f32(_sum1), vget_high_f32(_sum1));
8240 float32x2_t _ss01 = vpadd_f32(_ss0, _ss1);
8241
8242 *outptr0 = vget_lane_f32(_ss01, 0);
8243 *outptr1 = vget_lane_f32(_ss01, 1);
8244 #endif // __aarch64__
8245 #else
8246 float sum0 = 0.f;
8247 float sum1 = 0.f;
8248
8249 sum0 += r0[0] * k0[0];
8250 sum0 += r0[1] * k0[1];
8251 sum0 += r0[2] * k0[2];
8252 sum0 += r1[0] * k0[3];
8253 sum0 += r1[1] * k0[4];
8254 sum0 += r1[2] * k0[5];
8255 sum0 += r2[0] * k0[6];
8256 sum0 += r2[1] * k0[7];
8257 sum0 += r2[2] * k0[8];
8258
8259 sum1 += r0[0] * k1[0];
8260 sum1 += r0[1] * k1[1];
8261 sum1 += r0[2] * k1[2];
8262 sum1 += r1[0] * k1[3];
8263 sum1 += r1[1] * k1[4];
8264 sum1 += r1[2] * k1[5];
8265 sum1 += r2[0] * k1[6];
8266 sum1 += r2[1] * k1[7];
8267 sum1 += r2[2] * k1[8];
8268
8269 *outptr0 += sum0;
8270 *outptr1 += sum1;
8271 #endif // __ARM_NEON
8272
8273 r0 += 2;
8274 r1 += 2;
8275 r2 += 2;
8276 outptr0++;
8277 outptr1++;
8278 }
8279
8280 r0 += tailstep;
8281 r1 += tailstep;
8282 r2 += tailstep;
8283 }
8284
8285 k0 += 9;
8286 k1 += 9;
8287 }
8288 }
8289
8290 #pragma omp parallel for num_threads(opt.num_threads)
8291 for (int p = remain_outch_start; p < outch; p++)
8292 {
8293 Mat out = top_blob.channel(p);
8294
8295 const float bias0 = bias ? bias[p] : 0.f;
8296
8297 out.fill(bias0);
8298
8299 const float* kernel0 = kernel + p * inch * 9;
8300
8301 for (int q = 0; q < inch; q++)
8302 {
8303 float* outptr = out;
8304
8305 const float* img0 = bottom_blob.channel(q);
8306
8307 const float* r0 = img0;
8308 const float* r1 = img0 + w;
8309 const float* r2 = img0 + w * 2;
8310
8311 const float* k0 = kernel0;
8312 const float* k1 = kernel0 + 3;
8313 const float* k2 = kernel0 + 6;
8314
8315 #if __ARM_NEON
8316 float32x4_t _k0123 = vld1q_f32(k0);
8317 float32x4_t _k3456 = vld1q_f32(k1);
8318 float32x4_t _k6789 = vld1q_f32(k2);
8319 #endif // __ARM_NEON
8320
8321 int i = 0;
8322
8323 for (; i < outh; i++)
8324 {
8325 #if __ARM_NEON
8326 int nn = outw >> 2;
8327 int remain = outw & 3;
8328 #else
8329 int remain = outw;
8330 #endif // __ARM_NEON
8331
8332 #if __ARM_NEON
8333 #if __aarch64__
8334 if (nn > 0)
8335 {
8336 asm volatile(
8337 "prfm pldl1keep, [%2, #256] \n"
8338 "ld2 {v2.4s, v3.4s}, [%2], #32 \n"
8339 "0: \n"
8340
8341 "prfm pldl1keep, [%1, #128] \n"
8342 "ld1 {v0.4s}, [%1] \n"
8343
8344 "fmla v0.4s, v2.4s, %10.s[0] \n"
8345 "fmul v10.4s, v3.4s, %10.s[1] \n"
8346
8347 "prfm pldl1keep, [%2, #256] \n"
8348 "ld2 {v8.4s, v9.4s}, [%2] \n"
8349 "ext v1.16b, v2.16b, v8.16b, #4 \n"
8350
8351 "fmul v11.4s, v1.4s, %10.s[2] \n"
8352
8353 "prfm pldl1keep, [%3, #256] \n"
8354 "ld2 {v2.4s, v3.4s}, [%3], #32 \n"
8355
8356 "fmla v0.4s, v2.4s, %11.s[0] \n"
8357 "fmla v10.4s, v3.4s, %11.s[1] \n"
8358
8359 "prfm pldl1keep, [%3, #256] \n"
8360 "ld2 {v8.4s, v9.4s}, [%3] \n"
8361 "ext v1.16b, v2.16b, v8.16b, #4 \n"
8362
8363 "fmla v11.4s, v1.4s, %11.s[2] \n"
8364
8365 "prfm pldl1keep, [%4, #256] \n"
8366 "ld2 {v2.4s, v3.4s}, [%4], #32 \n"
8367
8368 "fmla v0.4s, v2.4s, %12.s[0] \n"
8369 "fmla v10.4s, v3.4s, %12.s[1] \n"
8370
8371 "prfm pldl1keep, [%4, #256] \n"
8372 "ld2 {v8.4s, v9.4s}, [%4] \n"
8373 "ext v1.16b, v2.16b, v8.16b, #4 \n"
8374
8375 "fmla v11.4s, v1.4s, %12.s[2] \n"
8376
8377 "prfm pldl1keep, [%2, #256] \n"
8378 "ld2 {v2.4s, v3.4s}, [%2], #32 \n"
8379
8380 "fadd v0.4s, v0.4s, v10.4s \n"
8381 "fadd v0.4s, v0.4s, v11.4s \n"
8382
8383 "subs %w0, %w0, #1 \n"
8384 "st1 {v0.4s}, [%1], #16 \n"
8385 "bne 0b \n"
8386 "sub %2, %2, #32 \n"
8387 : "=r"(nn), // %0
8388 "=r"(outptr), // %1
8389 "=r"(r0), // %2
8390 "=r"(r1), // %3
8391 "=r"(r2) // %4
8392 : "0"(nn),
8393 "1"(outptr),
8394 "2"(r0),
8395 "3"(r1),
8396 "4"(r2),
8397 "w"(_k0123), // %10
8398 "w"(_k3456), // %11
8399 "w"(_k6789) // %12
8400 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8401 }
8402 #else
8403 if (nn > 0)
8404 {
8405 asm volatile(
8406 "pld [%2, #256] \n"
8407 "vld2.f32 {d4-d7}, [%2]! \n"
8408
8409 "0: \n"
8410 "pld [%1, #128] \n"
8411 "vld1.f32 {d0-d1}, [%1] \n"
8412
8413 "vmla.f32 q0, q2, %e10[0] \n"
8414 "vmul.f32 q10, q3, %e10[1] \n"
8415
8416 "pld [%2, #128] \n"
8417 "vld2.f32 {d16-d17}, [%2] \n"
8418 "vext.32 q1, q2, q8, #1 \n"
8419
8420 "vmul.f32 q11, q1, %f10[0] \n"
8421
8422 "pld [%3, #256] \n"
8423 "vld2.f32 {d4-d7}, [%3]! \n"
8424
8425 "vmla.f32 q0, q2, %e11[0] \n"
8426 "vmla.f32 q10, q3, %e11[1] \n"
8427
8428 "pld [%3, #128] \n"
8429 "vld2.f32 {d16-d17}, [%3] \n"
8430 "vext.32 q1, q2, q8, #1 \n"
8431
8432 "vmla.f32 q11, q1, %f11[0] \n"
8433
8434 "pld [%4, #256] \n"
8435 "vld2.f32 {d4-d7}, [%4]! \n"
8436
8437 "vmla.f32 q0, q2, %e12[0] \n"
8438 "vmla.f32 q10, q3, %e12[1] \n"
8439
8440 "pld [%4, #128] \n"
8441 "vld2.f32 {d16-d17}, [%4] \n"
8442 "vext.32 q1, q2, q8, #1 \n"
8443
8444 "vmla.f32 q11, q1, %f12[0] \n"
8445
8446 "pld [%2, #256] \n"
8447 "vld2.f32 {d4-d7}, [%2]! \n"
8448
8449 "vadd.f32 q0, q0, q10 \n"
8450 "vadd.f32 q0, q0, q11 \n"
8451
8452 "subs %0, #1 \n"
8453 "vst1.f32 {d0-d1}, [%1]! \n"
8454 "bne 0b \n"
8455 "sub %2, #32 \n"
8456 : "=r"(nn), // %0
8457 "=r"(outptr), // %1
8458 "=r"(r0), // %2
8459 "=r"(r1), // %3
8460 "=r"(r2) // %4
8461 : "0"(nn),
8462 "1"(outptr),
8463 "2"(r0),
8464 "3"(r1),
8465 "4"(r2),
8466 "w"(_k0123), // %10
8467 "w"(_k3456), // %11
8468 "w"(_k6789) // %12
8469 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
8470 }
8471 #endif // __aarch64__
8472 #endif // __ARM_NEON
8473 for (; remain > 0; remain--)
8474 {
8475 #if __ARM_NEON
8476 float32x4_t _r00 = vld1q_f32(r0);
8477 float32x4_t _r10 = vld1q_f32(r1);
8478 float32x4_t _r20 = vld1q_f32(r2);
8479
8480 float32x4_t _sum = vmulq_f32(_r00, _k0123);
8481 _sum = vmlaq_f32(_sum, _r10, _k3456);
8482 _sum = vmlaq_f32(_sum, _r20, _k6789);
8483
8484 _sum = vsetq_lane_f32(*outptr, _sum, 3);
8485
8486 #if __aarch64__
8487 *outptr = vaddvq_f32(_sum);
8488 #else
8489 float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
8490 _ss = vpadd_f32(_ss, _ss);
8491
8492 *outptr = vget_lane_f32(_ss, 0);
8493 #endif // __aarch64__
8494 #else
8495 float sum = 0.f;
8496
8497 sum += r0[0] * k0[0];
8498 sum += r0[1] * k0[1];
8499 sum += r0[2] * k0[2];
8500 sum += r1[0] * k1[0];
8501 sum += r1[1] * k1[1];
8502 sum += r1[2] * k1[2];
8503 sum += r2[0] * k2[0];
8504 sum += r2[1] * k2[1];
8505 sum += r2[2] * k2[2];
8506
8507 *outptr += sum;
8508 #endif // __ARM_NEON
8509
8510 r0 += 2;
8511 r1 += 2;
8512 r2 += 2;
8513 outptr++;
8514 }
8515
8516 r0 += tailstep;
8517 r1 += tailstep;
8518 r2 += tailstep;
8519 }
8520
8521 kernel0 += 9;
8522 }
8523 }
8524 }
8525
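// repack the 3x3 kernels for the 8-channel blocked kernel below: one
// kernel_tm channel holds 8 output channels laid out as [inch][9 taps][8 outch]
// so the inner loop can stream weights linearly; leftover output channels
// are stored one per channel after the 8-wide blocks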
8526 static void conv3x3s2_transform_kernel_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
8527 {
8528 kernel_tm.create(8 * 9, inch, outch / 8 + outch % 8);
8529
8530 const float* kernel = _kernel;
8531
8532 int p = 0;
8533 for (; p + 7 < outch; p += 8)
8534 {
8535 const float* k0 = kernel + (p + 0) * inch * 9;
8536 const float* k1 = kernel + (p + 1) * inch * 9;
8537 const float* k2 = kernel + (p + 2) * inch * 9;
8538 const float* k3 = kernel + (p + 3) * inch * 9;
8539 const float* k4 = kernel + (p + 4) * inch * 9;
8540 const float* k5 = kernel + (p + 5) * inch * 9;
8541 const float* k6 = kernel + (p + 6) * inch * 9;
8542 const float* k7 = kernel + (p + 7) * inch * 9;
8543
8544 float* ktmp = kernel_tm.channel(p / 8);
8545
8546 for (int q = 0; q < inch; q++)
8547 {
8548 for (int k = 0; k < 9; k++)
8549 {
8550 ktmp[0] = k0[k];
8551 ktmp[1] = k1[k];
8552 ktmp[2] = k2[k];
8553 ktmp[3] = k3[k];
8554 ktmp[4] = k4[k];
8555 ktmp[5] = k5[k];
8556 ktmp[6] = k6[k];
8557 ktmp[7] = k7[k];
8558 ktmp += 8;
8559 }
8560
8561 k0 += 9;
8562 k1 += 9;
8563 k2 += 9;
8564 k3 += 9;
8565 k4 += 9;
8566 k5 += 9;
8567 k6 += 9;
8568 k7 += 9;
8569 }
8570 }
8571 for (; p < outch; p++)
8572 {
8573 const float* k0 = kernel + (p + 0) * inch * 9;
8574
8575 float* ktmp = kernel_tm.channel(p / 8 + p % 8);
8576
8577 for (int q = 0; q < inch; q++)
8578 {
8579 for (int k = 0; k < 9; k++)
8580 {
8581 ktmp[k] = k0[k];
8582 }
8583 ktmp += 9;
8584
8585 k0 += 9;
8586 }
8587 }
8588 }
8589
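// conv3x3s2_packed_neon: stride-2 3x3 convolution over 8 output channels at
// once, consuming the [inch][9][8] weight layout produced above; each
// input-channel pass reads 9 * 8 weights = 288 bytes, which is what the
// "sub %12, %12, #288" rewind in the asm undoes per loop iteration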
8590 static void conv3x3s2_packed_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Mat& _bias, const Option& opt)
8591 {
8592 int w = bottom_blob.w;
8593 int inch = bottom_blob.c;
8594
8595 int outw = top_blob.w;
8596 int outh = top_blob.h;
8597 int outch = top_blob.c;
8598
8599 const int tailstep = w - 2 * outw + w;
8600
8601 // const float* kernel = _kernel;
8602 const float* bias = _bias;
8603
8604 int nn_outch = outch >> 3;
8605 int remain_outch_start = nn_outch << 3;
8606
8607 #pragma omp parallel for num_threads(opt.num_threads)
8608 for (int pp = 0; pp < nn_outch; pp++)
8609 {
8610 int p = pp * 8;
8611
8612 Mat out0 = top_blob.channel(p + 0);
8613 Mat out1 = top_blob.channel(p + 1);
8614 Mat out2 = top_blob.channel(p + 2);
8615 Mat out3 = top_blob.channel(p + 3);
8616 Mat out4 = top_blob.channel(p + 4);
8617 Mat out5 = top_blob.channel(p + 5);
8618 Mat out6 = top_blob.channel(p + 6);
8619 Mat out7 = top_blob.channel(p + 7);
8620
8621 const float bias0 = bias ? bias[p + 0] : 0.f;
8622 const float bias1 = bias ? bias[p + 1] : 0.f;
8623 const float bias2 = bias ? bias[p + 2] : 0.f;
8624 const float bias3 = bias ? bias[p + 3] : 0.f;
8625 const float bias4 = bias ? bias[p + 4] : 0.f;
8626 const float bias5 = bias ? bias[p + 5] : 0.f;
8627 const float bias6 = bias ? bias[p + 6] : 0.f;
8628 const float bias7 = bias ? bias[p + 7] : 0.f;
8629
8630 out0.fill(bias0);
8631 out1.fill(bias1);
8632 out2.fill(bias2);
8633 out3.fill(bias3);
8634 out4.fill(bias4);
8635 out5.fill(bias5);
8636 out6.fill(bias6);
8637 out7.fill(bias7);
8638
8639 const float* ktmp = _kernel.channel(p / 8);
8640
8641 for (int q = 0; q < inch; q++)
8642 {
8643 float* outptr0 = out0;
8644 float* outptr1 = out1;
8645 float* outptr2 = out2;
8646 float* outptr3 = out3;
8647 float* outptr4 = out4;
8648 float* outptr5 = out5;
8649 float* outptr6 = out6;
8650 float* outptr7 = out7;
8651
8652 const float* img0 = bottom_blob.channel(q);
8653
8654 const float* r0 = img0;
8655 const float* r1 = img0 + w;
8656 const float* r2 = img0 + w * 2;
8657
8658 int i = 0;
8659
8660 for (; i < outh; i++)
8661 {
8662 #if __ARM_NEON
8663 int nn = outw >> 2;
8664 int remain = outw & 3;
8665 #else
8666 int remain = outw;
8667 #endif // __ARM_NEON
8668
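// vector body: 4 horizontally adjacent outputs for all 8 channels live in
// v8-v15 (q8-q15 on armv7); the 72 per-channel weights are re-streamed
// from ktmp every iteration and rewound afterwards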
8669 #if __ARM_NEON
8670 #if __aarch64__
8671 if (nn > 0)
8672 {
8673 asm volatile(
8674 "0: \n"
8675
8676 "prfm pldl1keep, [%1, #128] \n"
8677 "ld1 {v8.4s}, [%1] \n"
8678 "prfm pldl1keep, [%2, #128] \n"
8679 "ld1 {v9.4s}, [%2] \n"
8680
8681 "prfm pldl1keep, [%3, #128] \n"
8682 "ld1 {v10.4s}, [%3] \n"
8683 "prfm pldl1keep, [%4, #128] \n"
8684 "ld1 {v11.4s}, [%4] \n"
8685
8686 ///
8687 "prfm pldl1keep, [%9, #256] \n"
8688 "ld2 {v4.4s, v5.4s}, [%9], #32 \n" // v4=00 v5=01
8689
8690 "ld1 {v0.4s, v1.4s}, [%12], #32 \n"
8691
8692 "fmla v8.4s, v4.4s, v0.s[0] \n"
8693 "fmla v9.4s, v4.4s, v0.s[1] \n"
8694
8695 "prfm pldl1keep, [%5, #128] \n"
8696 "ld1 {v12.4s}, [%5] \n"
8697 "prfm pldl1keep, [%6, #128] \n"
8698 "ld1 {v13.4s}, [%6] \n"
8699
8700 "fmla v10.4s, v4.4s, v0.s[2] \n"
8701 "fmla v11.4s, v4.4s, v0.s[3] \n"
8702
8703 "prfm pldl1keep, [%7, #128] \n"
8704 "ld1 {v14.4s}, [%7] \n"
8705 "prfm pldl1keep, [%8, #128] \n"
8706 "ld1 {v15.4s}, [%8] \n"
8707
8708 "ld1 {v2.4s, v3.4s}, [%12], #32 \n"
8709
8710 "fmla v12.4s, v4.4s, v1.s[0] \n"
8711 "fmla v13.4s, v4.4s, v1.s[1] \n"
8712 "fmla v14.4s, v4.4s, v1.s[2] \n"
8713 "fmla v15.4s, v4.4s, v1.s[3] \n"
8714
8715 "prfm pldl1keep, [%9, #256] \n"
8716 "ld2 {v6.4s, v7.4s}, [%9] \n" // v6
8717
8718 "fmla v8.4s, v5.4s, v2.s[0] \n"
8719 "fmla v9.4s, v5.4s, v2.s[1] \n"
8720 "fmla v10.4s, v5.4s, v2.s[2] \n"
8721 "fmla v11.4s, v5.4s, v2.s[3] \n"
8722
8723 "ext v6.16b, v4.16b, v6.16b, #4 \n" // v6=02
8724
8725 "ld1 {v0.4s, v1.4s}, [%12], #32 \n"
8726
8727 "fmla v12.4s, v5.4s, v3.s[0] \n"
8728 "fmla v13.4s, v5.4s, v3.s[1] \n"
8729 "fmla v14.4s, v5.4s, v3.s[2] \n"
8730 "fmla v15.4s, v5.4s, v3.s[3] \n"
8731
8732 ///
8733 "prfm pldl1keep, [%10, #256] \n"
8734 "ld2 {v4.4s, v5.4s}, [%10], #32 \n" // v4=10 v5=11
8735
8736 "fmla v8.4s, v6.4s, v0.s[0] \n"
8737 "fmla v9.4s, v6.4s, v0.s[1] \n"
8738 "fmla v10.4s, v6.4s, v0.s[2] \n"
8739 "fmla v11.4s, v6.4s, v0.s[3] \n"
8740
8741 "ld1 {v2.4s, v3.4s}, [%12], #32 \n"
8742
8743 "fmla v12.4s, v6.4s, v1.s[0] \n"
8744 "fmla v13.4s, v6.4s, v1.s[1] \n"
8745 "fmla v14.4s, v6.4s, v1.s[2] \n"
8746 "fmla v15.4s, v6.4s, v1.s[3] \n"
8747
8748 "fmla v8.4s, v4.4s, v2.s[0] \n"
8749 "fmla v9.4s, v4.4s, v2.s[1] \n"
8750 "fmla v10.4s, v4.4s, v2.s[2] \n"
8751 "fmla v11.4s, v4.4s, v2.s[3] \n"
8752
8753 "ld1 {v0.4s, v1.4s}, [%12], #32 \n"
8754
8755 "fmla v12.4s, v4.4s, v3.s[0] \n"
8756 "fmla v13.4s, v4.4s, v3.s[1] \n"
8757 "fmla v14.4s, v4.4s, v3.s[2] \n"
8758 "fmla v15.4s, v4.4s, v3.s[3] \n"
8759
8760 "prfm pldl1keep, [%10, #256] \n"
8761 "ld2 {v6.4s, v7.4s}, [%10] \n" // v6
8762
8763 "fmla v8.4s, v5.4s, v0.s[0] \n"
8764 "fmla v9.4s, v5.4s, v0.s[1] \n"
8765 "fmla v10.4s, v5.4s, v0.s[2] \n"
8766 "fmla v11.4s, v5.4s, v0.s[3] \n"
8767
8768 "ld1 {v2.4s, v3.4s}, [%12], #32 \n"
8769
8770 "ext v6.16b, v4.16b, v6.16b, #4 \n" // v6=12
8771
8772 "fmla v12.4s, v5.4s, v1.s[0] \n"
8773 "fmla v13.4s, v5.4s, v1.s[1] \n"
8774 "fmla v14.4s, v5.4s, v1.s[2] \n"
8775 "fmla v15.4s, v5.4s, v1.s[3] \n"
8776
8777 ///
8778 "prfm pldl1keep, [%11, #256] \n"
8779 "ld2 {v4.4s, v5.4s}, [%11], #32 \n" // v4=20 v5=21
8780
8781 "fmla v8.4s, v6.4s, v2.s[0] \n"
8782 "fmla v9.4s, v6.4s, v2.s[1] \n"
8783 "fmla v10.4s, v6.4s, v2.s[2] \n"
8784 "fmla v11.4s, v6.4s, v2.s[3] \n"
8785
8786 "ld1 {v0.4s, v1.4s}, [%12], #32 \n"
8787
8788 "fmla v12.4s, v6.4s, v3.s[0] \n"
8789 "fmla v13.4s, v6.4s, v3.s[1] \n"
8790 "fmla v14.4s, v6.4s, v3.s[2] \n"
8791 "fmla v15.4s, v6.4s, v3.s[3] \n"
8792
8793 "fmla v8.4s, v4.4s, v0.s[0] \n"
8794 "fmla v9.4s, v4.4s, v0.s[1] \n"
8795 "fmla v10.4s, v4.4s, v0.s[2] \n"
8796 "fmla v11.4s, v4.4s, v0.s[3] \n"
8797
8798 "ld1 {v2.4s, v3.4s}, [%12], #32 \n"
8799
8800 "fmla v12.4s, v4.4s, v1.s[0] \n"
8801 "fmla v13.4s, v4.4s, v1.s[1] \n"
8802 "fmla v14.4s, v4.4s, v1.s[2] \n"
8803 "fmla v15.4s, v4.4s, v1.s[3] \n"
8804
8805 "prfm pldl1keep, [%11, #256] \n"
8806 "ld2 {v6.4s, v7.4s}, [%11] \n" // v6
8807
8808 "fmla v8.4s, v5.4s, v2.s[0] \n"
8809 "fmla v9.4s, v5.4s, v2.s[1] \n"
8810 "fmla v10.4s, v5.4s, v2.s[2] \n"
8811 "fmla v11.4s, v5.4s, v2.s[3] \n"
8812
8813 "ext v6.16b, v4.16b, v6.16b, #4 \n" // v6=22
8814
8815 "ld1 {v0.4s, v1.4s}, [%12], #32 \n"
8816
8817 "fmla v12.4s, v5.4s, v3.s[0] \n"
8818 "fmla v13.4s, v5.4s, v3.s[1] \n"
8819 "fmla v14.4s, v5.4s, v3.s[2] \n"
8820 "fmla v15.4s, v5.4s, v3.s[3] \n"
8821
8822 "fmla v8.4s, v6.4s, v0.s[0] \n"
8823 "fmla v9.4s, v6.4s, v0.s[1] \n"
8824 "fmla v10.4s, v6.4s, v0.s[2] \n"
8825 "fmla v11.4s, v6.4s, v0.s[3] \n"
8826
8827 "fmla v12.4s, v6.4s, v1.s[0] \n"
8828 "fmla v13.4s, v6.4s, v1.s[1] \n"
8829
8830 "st1 {v8.4s}, [%1], #16 \n"
8831 "st1 {v9.4s}, [%2], #16 \n"
8832
8833 "fmla v14.4s, v6.4s, v1.s[2] \n"
8834 "fmla v15.4s, v6.4s, v1.s[3] \n"
8835
8836 "st1 {v10.4s}, [%3], #16 \n"
8837 "st1 {v11.4s}, [%4], #16 \n"
8838
8839 "sub %12, %12, #288 \n"
8840
8841 "st1 {v12.4s}, [%5], #16 \n"
8842 "st1 {v13.4s}, [%6], #16 \n"
8843
8844 "subs %w0, %w0, #1 \n"
8845
8846 "st1 {v14.4s}, [%7], #16 \n"
8847 "st1 {v15.4s}, [%8], #16 \n"
8848
8849 "bne 0b \n"
8850 : "=r"(nn), // %0
8851 "=r"(outptr0), // %1
8852 "=r"(outptr1), // %2
8853 "=r"(outptr2), // %3
8854 "=r"(outptr3), // %4
8855 "=r"(outptr4), // %5
8856 "=r"(outptr5), // %6
8857 "=r"(outptr6), // %7
8858 "=r"(outptr7), // %8
8859 "=r"(r0), // %9
8860 "=r"(r1), // %10
8861 "=r"(r2), // %11
8862 "=r"(ktmp) // %12
8863 : "0"(nn),
8864 "1"(outptr0),
8865 "2"(outptr1),
8866 "3"(outptr2),
8867 "4"(outptr3),
8868 "5"(outptr4),
8869 "6"(outptr5),
8870 "7"(outptr6),
8871 "8"(outptr7),
8872 "9"(r0),
8873 "10"(r1),
8874 "11"(r2),
8875 "12"(ktmp)
8876 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
8877 }
8878 #else // __aarch64__
8879 if (nn > 0)
8880 {
8881 asm volatile(
8882 "0: \n"
8883
8884 "pld [%1, #128] \n"
8885 "vld1.f32 {d16-d17}, [%1] \n"
8886 "pld [%2, #128] \n"
8887 "vld1.f32 {d18-d19}, [%2] \n"
8888
8889 "pld [%3, #128] \n"
8890 "vld1.f32 {d20-d21}, [%3] \n"
8891 "pld [%4, #128] \n"
8892 "vld1.f32 {d22-d23}, [%4] \n"
8893
8894 ///
8895 "pld [%9, #256] \n"
8896 "vld2.f32 {d8-d11}, [%9]! \n" // q4=00 q5=01
8897
8898 "vld1.f32 {d0-d3}, [%12 :128]! \n"
8899
8900 "vmla.f32 q8, q4, d0[0] \n"
8901 "vmla.f32 q9, q4, d0[1] \n"
8902
8903 "pld [%5, #128] \n"
8904 "vld1.f32 {d24-d25}, [%5] \n"
8905 "pld [%6, #128] \n"
8906 "vld1.f32 {d26-d27}, [%6] \n"
8907
8908 "vmla.f32 q10, q4, d1[0] \n"
8909 "vmla.f32 q11, q4, d1[1] \n"
8910
8911 "pld [%7, #128] \n"
8912 "vld1.f32 {d28-d29}, [%7] \n"
8913 "pld [%8, #128] \n"
8914 "vld1.f32 {d30-d31}, [%8] \n"
8915
8916 "vld1.f32 {d4-d7}, [%12 :128]! \n"
8917
8918 "vmla.f32 q12, q4, d2[0] \n"
8919 "vmla.f32 q13, q4, d2[1] \n"
8920 "vmla.f32 q14, q4, d3[0] \n"
8921 "vmla.f32 q15, q4, d3[1] \n"
8922
8923 "pld [%9, #128] \n"
8924 "vld2.f32 {d12-d13}, [%9] \n" // q6
8925
8926 "vmla.f32 q8, q5, d4[0] \n"
8927 "vmla.f32 q9, q5, d4[1] \n"
8928 "vmla.f32 q10, q5, d5[0] \n"
8929 "vmla.f32 q11, q5, d5[1] \n"
8930
8931 "vext.f32 q6, q4, q6, #1 \n" // q6=02
8932
8933 "vld1.f32 {d0-d3}, [%12 :128]! \n"
8934
8935 "vmla.f32 q12, q5, d6[0] \n"
8936 "vmla.f32 q13, q5, d6[1] \n"
8937 "vmla.f32 q14, q5, d7[0] \n"
8938 "vmla.f32 q15, q5, d7[1] \n"
8939
8940 ///
8941 "pld [%10, #256] \n"
8942 "vld2.f32 {d8-d11}, [%10]! \n" // q4=10 q5=11
8943
8944 "vmla.f32 q8, q6, d0[0] \n"
8945 "vmla.f32 q9, q6, d0[1] \n"
8946 "vmla.f32 q10, q6, d1[0] \n"
8947 "vmla.f32 q11, q6, d1[1] \n"
8948
8949 "vld1.f32 {d4-d7}, [%12 :128]! \n"
8950
8951 "vmla.f32 q12, q6, d2[0] \n"
8952 "vmla.f32 q13, q6, d2[1] \n"
8953 "vmla.f32 q14, q6, d3[0] \n"
8954 "vmla.f32 q15, q6, d3[1] \n"
8955
8956 "vmla.f32 q8, q4, d4[0] \n"
8957 "vmla.f32 q9, q4, d4[1] \n"
8958 "vmla.f32 q10, q4, d5[0] \n"
8959 "vmla.f32 q11, q4, d5[1] \n"
8960
8961 "vld1.f32 {d0-d3}, [%12 :128]! \n"
8962
8963 "vmla.f32 q12, q4, d6[0] \n"
8964 "vmla.f32 q13, q4, d6[1] \n"
8965 "vmla.f32 q14, q4, d7[0] \n"
8966 "vmla.f32 q15, q4, d7[1] \n"
8967
8968 "pld [%10, #128] \n"
8969 "vld2.f32 {d12-d13}, [%10] \n" // q6
8970
8971 "vmla.f32 q8, q5, d0[0] \n"
8972 "vmla.f32 q9, q5, d0[1] \n"
8973 "vmla.f32 q10, q5, d1[0] \n"
8974 "vmla.f32 q11, q5, d1[1] \n"
8975
8976 "vld1.f32 {d4-d7}, [%12 :128]! \n"
8977
8978 "vext.f32 q6, q4, q6, #1 \n" // q6=12
8979
8980 "vmla.f32 q12, q5, d2[0] \n"
8981 "vmla.f32 q13, q5, d2[1] \n"
8982 "vmla.f32 q14, q5, d3[0] \n"
8983 "vmla.f32 q15, q5, d3[1] \n"
8984
8985 ///
8986 "pld [%11, #256] \n"
8987 "vld2.f32 {d8-d11}, [%11]! \n" // q4=20 q5=21
8988
8989 "vmla.f32 q8, q6, d4[0] \n"
8990 "vmla.f32 q9, q6, d4[1] \n"
8991 "vmla.f32 q10, q6, d5[0] \n"
8992 "vmla.f32 q11, q6, d5[1] \n"
8993
8994 "vld1.f32 {d0-d3}, [%12 :128]! \n"
8995
8996 "vmla.f32 q12, q6, d6[0] \n"
8997 "vmla.f32 q13, q6, d6[1] \n"
8998 "vmla.f32 q14, q6, d7[0] \n"
8999 "vmla.f32 q15, q6, d7[1] \n"
9000
9001 "vmla.f32 q8, q4, d0[0] \n"
9002 "vmla.f32 q9, q4, d0[1] \n"
9003 "vmla.f32 q10, q4, d1[0] \n"
9004 "vmla.f32 q11, q4, d1[1] \n"
9005
9006 "vld1.f32 {d4-d7}, [%12 :128]! \n"
9007
9008 "vmla.f32 q12, q4, d2[0] \n"
9009 "vmla.f32 q13, q4, d2[1] \n"
9010 "vmla.f32 q14, q4, d3[0] \n"
9011 "vmla.f32 q15, q4, d3[1] \n"
9012
9013 "pld [%11, #128] \n"
9014 "vld2.f32 {d12-d13}, [%11] \n" // q6
9015
9016 "vmla.f32 q8, q5, d4[0] \n"
9017 "vmla.f32 q9, q5, d4[1] \n"
9018 "vmla.f32 q10, q5, d5[0] \n"
9019 "vmla.f32 q11, q5, d5[1] \n"
9020
9021 "vext.f32 q6, q4, q6, #1 \n" // q6=22
9022
9023 "vld1.f32 {d0-d3}, [%12 :128]! \n"
9024
9025 "vmla.f32 q12, q5, d6[0] \n"
9026 "vmla.f32 q13, q5, d6[1] \n"
9027 "vmla.f32 q14, q5, d7[0] \n"
9028 "vmla.f32 q15, q5, d7[1] \n"
9029
9030 "vmla.f32 q8, q6, d0[0] \n"
9031 "vmla.f32 q9, q6, d0[1] \n"
9032 "vmla.f32 q10, q6, d1[0] \n"
9033 "vmla.f32 q11, q6, d1[1] \n"
9034
9035 "vmla.f32 q12, q6, d2[0] \n"
9036 "vmla.f32 q13, q6, d2[1] \n"
9037
9038 "vst1.f32 {d16-d17}, [%1]! \n"
9039 "vst1.f32 {d18-d19}, [%2]! \n"
9040
9041 "vmla.f32 q14, q6, d3[0] \n"
9042 "vmla.f32 q15, q6, d3[1] \n"
9043
9044 "vst1.f32 {d20-d21}, [%3]! \n"
9045 "vst1.f32 {d22-d23}, [%4]! \n"
9046
9047 "sub %12, %12, #288 \n"
9048
9049 "vst1.f32 {d24-d25}, [%5]! \n"
9050 "vst1.f32 {d26-d27}, [%6]! \n"
9051
9052 "subs %0, #1 \n"
9053
9054 "vst1.f32 {d28-d29}, [%7]! \n"
9055 "vst1.f32 {d30-d31}, [%8]! \n"
9056
9057 "bne 0b \n"
9058 : "=r"(nn), // %0
9059 "=r"(outptr0), // %1
9060 "=r"(outptr1), // %2
9061 "=r"(outptr2), // %3
9062 "=r"(outptr3), // %4
9063 "=r"(outptr4), // %5
9064 "=r"(outptr5), // %6
9065 "=r"(outptr6), // %7
9066 "=r"(outptr7), // %8
9067 "=r"(r0), // %9
9068 "=r"(r1), // %10
9069 "=r"(r2), // %11
9070 "=r"(ktmp) // %12
9071 : "0"(nn),
9072 "1"(outptr0),
9073 "2"(outptr1),
9074 "3"(outptr2),
9075 "4"(outptr3),
9076 "5"(outptr4),
9077 "6"(outptr5),
9078 "7"(outptr6),
9079 "8"(outptr7),
9080 "9"(r0),
9081 "10"(r1),
9082 "11"(r2),
9083 "12"(ktmp)
9084 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9085 }
9086 #endif // __aarch64__
9087 #endif // __ARM_NEON
9088 for (; remain > 0; remain--)
9089 {
9090 #if __ARM_NEON
9091 #if __aarch64__
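// tail: one output column across all 8 channels; v8/v9 gather the eight
// accumulators with lane loads, and the weights stream 8 floats per tap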
9092 asm volatile(
9093 "ld1 {v10.4s, v11.4s}, [%11], #32 \n"
9094
9095 "prfm pldl1keep, [%8, #128] \n"
9096 "ld1 {v0.4s}, [%8] \n"
9097
9098 "ld1 {v12.4s, v13.4s}, [%11], #32 \n"
9099
9100 "ld1 {v8.s}[0], [%0] \n"
9101 "ld1 {v8.s}[1], [%1] \n"
9102 "ld1 {v8.s}[2], [%2] \n"
9103 "ld1 {v8.s}[3], [%3] \n"
9104
9105 "fmul v14.4s, v10.4s, v0.s[0] \n"
9106 "fmul v15.4s, v11.4s, v0.s[0] \n"
9107
9108 "ld1 {v9.s}[0], [%4] \n"
9109 "ld1 {v9.s}[1], [%5] \n"
9110 "ld1 {v9.s}[2], [%6] \n"
9111 "ld1 {v9.s}[3], [%7] \n"
9112
9113 "ld1 {v10.4s, v11.4s}, [%11], #32 \n"
9114
9115 "fmla v8.4s, v12.4s, v0.s[1] \n"
9116 "fmla v9.4s, v13.4s, v0.s[1] \n"
9117
9118 "ld1 {v12.4s, v13.4s}, [%11], #32 \n"
9119
9120 "fmla v14.4s, v10.4s, v0.s[2] \n"
9121 "fmla v15.4s, v11.4s, v0.s[2] \n"
9122
9123 "prfm pldl1keep, [%9, #128] \n"
9124 "ld1 {v1.4s}, [%9] \n"
9125
9126 "ld1 {v10.4s, v11.4s}, [%11], #32 \n"
9127
9128 "fmla v8.4s, v12.4s, v1.s[0] \n"
9129 "fmla v9.4s, v13.4s, v1.s[0] \n"
9130
9131 "ld1 {v12.4s, v13.4s}, [%11], #32 \n"
9132
9133 "fmla v14.4s, v10.4s, v1.s[1] \n"
9134 "fmla v15.4s, v11.4s, v1.s[1] \n"
9135
9136 "ld1 {v10.4s, v11.4s}, [%11], #32 \n"
9137
9138 "fmla v8.4s, v12.4s, v1.s[2] \n"
9139 "fmla v9.4s, v13.4s, v1.s[2] \n"
9140
9141 "prfm pldl1keep, [%10, #128] \n"
9142 "ld1 {v0.4s}, [%10] \n"
9143
9144 "ld1 {v12.4s, v13.4s}, [%11], #32 \n"
9145
9146 "fmla v14.4s, v10.4s, v0.s[0] \n"
9147 "fmla v15.4s, v11.4s, v0.s[0] \n"
9148
9149 "ld1 {v10.4s, v11.4s}, [%11], #32 \n"
9150
9151 "fmla v8.4s, v12.4s, v0.s[1] \n"
9152 "fmla v9.4s, v13.4s, v0.s[1] \n"
9153
9154 "fmla v14.4s, v10.4s, v0.s[2] \n"
9155 "fmla v15.4s, v11.4s, v0.s[2] \n"
9156
9157 "fadd v8.4s, v8.4s, v14.4s \n"
9158 "fadd v9.4s, v9.4s, v15.4s \n"
9159
9160 "sub %11, %11, #288 \n"
9161
9162 "st1 {v8.s}[0], [%0], #4 \n"
9163 "st1 {v8.s}[1], [%1], #4 \n"
9164 "st1 {v8.s}[2], [%2], #4 \n"
9165 "st1 {v8.s}[3], [%3], #4 \n"
9166
9167 "st1 {v9.s}[0], [%4], #4 \n"
9168 "st1 {v9.s}[1], [%5], #4 \n"
9169 "st1 {v9.s}[2], [%6], #4 \n"
9170 "st1 {v9.s}[3], [%7], #4 \n"
9171
9172 : "=r"(outptr0), // %0
9173 "=r"(outptr1), // %1
9174 "=r"(outptr2), // %2
9175 "=r"(outptr3), // %3
9176 "=r"(outptr4), // %4
9177 "=r"(outptr5), // %5
9178 "=r"(outptr6), // %6
9179 "=r"(outptr7), // %7
9180 "=r"(r0), // %8
9181 "=r"(r1), // %9
9182 "=r"(r2), // %10
9183 "=r"(ktmp) // %11
9184 : "0"(outptr0),
9185 "1"(outptr1),
9186 "2"(outptr2),
9187 "3"(outptr3),
9188 "4"(outptr4),
9189 "5"(outptr5),
9190 "6"(outptr6),
9191 "7"(outptr7),
9192 "8"(r0),
9193 "9"(r1),
9194 "10"(r2),
9195 "11"(ktmp)
9196 : "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
9197 #else // __aarch64__
9198 asm volatile(
9199 "vld1.f32 {d20-d23}, [%11 :128]! \n"
9200
9201 "pld [%8, #128] \n"
9202 "vld1.f32 {d0-d1}, [%8] \n"
9203
9204 "vld1.f32 {d24-d27}, [%11 :128]! \n"
9205
9206 "vld1.f32 {d16[0]}, [%0] \n"
9207 "vld1.f32 {d16[1]}, [%1] \n"
9208 "vld1.f32 {d17[0]}, [%2] \n"
9209 "vld1.f32 {d17[1]}, [%3] \n"
9210
9211 "vmul.f32 q14, q10, d0[0] \n"
9212 "vmul.f32 q15, q11, d0[0] \n"
9213
9214 "vld1.f32 {d18[0]}, [%4] \n"
9215 "vld1.f32 {d18[1]}, [%5] \n"
9216 "vld1.f32 {d19[0]}, [%6] \n"
9217 "vld1.f32 {d19[1]}, [%7] \n"
9218
9219 "vld1.f32 {d20-d23}, [%11 :128]! \n"
9220
9221 "vmla.f32 q8, q12, d0[1] \n"
9222 "vmla.f32 q9, q13, d0[1] \n"
9223
9224 "vld1.f32 {d24-d27}, [%11 :128]! \n"
9225
9226 "vmla.f32 q14, q10, d1[0] \n"
9227 "vmla.f32 q15, q11, d1[0] \n"
9228
9229 "pld [%9, #128] \n"
9230 "vld1.f32 {d2-d3}, [%9] \n"
9231
9232 "vld1.f32 {d20-d23}, [%11 :128]! \n"
9233
9234 "vmla.f32 q8, q12, d2[0] \n"
9235 "vmla.f32 q9, q13, d2[0] \n"
9236
9237 "vld1.f32 {d24-d27}, [%11 :128]! \n"
9238
9239 "vmla.f32 q14, q10, d2[1] \n"
9240 "vmla.f32 q15, q11, d2[1] \n"
9241
9242 "vld1.f32 {d20-d23}, [%11 :128]! \n"
9243
9244 "vmla.f32 q8, q12, d3[0] \n"
9245 "vmla.f32 q9, q13, d3[0] \n"
9246
9247 "pld [%10, #128] \n"
9248 "vld1.f32 {d0-d1}, [%10] \n"
9249
9250 "vld1.f32 {d24-d27}, [%11 :128]! \n"
9251
9252 "vmla.f32 q14, q10, d0[0] \n"
9253 "vmla.f32 q15, q11, d0[0] \n"
9254
9255 "vld1.f32 {d20-d23}, [%11 :128]! \n"
9256
9257 "vmla.f32 q8, q12, d0[1] \n"
9258 "vmla.f32 q9, q13, d0[1] \n"
9259
9260 "vmla.f32 q14, q10, d1[0] \n"
9261 "vmla.f32 q15, q11, d1[0] \n"
9262
9263 "vadd.f32 q8, q8, q14 \n"
9264 "vadd.f32 q9, q9, q15 \n"
9265
9266 "sub %11, %11, #288 \n"
9267
9268 "vst1.f32 {d16[0]}, [%0]! \n"
9269 "vst1.f32 {d16[1]}, [%1]! \n"
9270 "vst1.f32 {d17[0]}, [%2]! \n"
9271 "vst1.f32 {d17[1]}, [%3]! \n"
9272
9273 "vst1.f32 {d18[0]}, [%4]! \n"
9274 "vst1.f32 {d18[1]}, [%5]! \n"
9275 "vst1.f32 {d19[0]}, [%6]! \n"
9276 "vst1.f32 {d19[1]}, [%7]! \n"
9277
9278 : "=r"(outptr0), // %0
9279 "=r"(outptr1), // %1
9280 "=r"(outptr2), // %2
9281 "=r"(outptr3), // %3
9282 "=r"(outptr4), // %4
9283 "=r"(outptr5), // %5
9284 "=r"(outptr6), // %6
9285 "=r"(outptr7), // %7
9286 "=r"(r0), // %8
9287 "=r"(r1), // %9
9288 "=r"(r2), // %10
9289 "=r"(ktmp) // %11
9290 : "0"(outptr0),
9291 "1"(outptr1),
9292 "2"(outptr2),
9293 "3"(outptr3),
9294 "4"(outptr4),
9295 "5"(outptr5),
9296 "6"(outptr6),
9297 "7"(outptr7),
9298 "8"(r0),
9299 "9"(r1),
9300 "10"(r2),
9301 "11"(ktmp)
9302 : "memory", "q0", "q1", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9303 #endif // __aarch64__
9304 #else // __ARM_NEON
9305 float sum0 = 0.f;
9306 float sum1 = 0.f;
9307 float sum2 = 0.f;
9308 float sum3 = 0.f;
9309 float sum4 = 0.f;
9310 float sum5 = 0.f;
9311 float sum6 = 0.f;
9312 float sum7 = 0.f;
9313
9314 sum0 += r0[0] * ktmp[0];
9315 sum1 += r0[0] * ktmp[1];
9316 sum2 += r0[0] * ktmp[2];
9317 sum3 += r0[0] * ktmp[3];
9318 sum4 += r0[0] * ktmp[4];
9319 sum5 += r0[0] * ktmp[5];
9320 sum6 += r0[0] * ktmp[6];
9321 sum7 += r0[0] * ktmp[7];
9322 ktmp += 8;
9323
9324 sum0 += r0[1] * ktmp[0];
9325 sum1 += r0[1] * ktmp[1];
9326 sum2 += r0[1] * ktmp[2];
9327 sum3 += r0[1] * ktmp[3];
9328 sum4 += r0[1] * ktmp[4];
9329 sum5 += r0[1] * ktmp[5];
9330 sum6 += r0[1] * ktmp[6];
9331 sum7 += r0[1] * ktmp[7];
9332 ktmp += 8;
9333
9334 sum0 += r0[2] * ktmp[0];
9335 sum1 += r0[2] * ktmp[1];
9336 sum2 += r0[2] * ktmp[2];
9337 sum3 += r0[2] * ktmp[3];
9338 sum4 += r0[2] * ktmp[4];
9339 sum5 += r0[2] * ktmp[5];
9340 sum6 += r0[2] * ktmp[6];
9341 sum7 += r0[2] * ktmp[7];
9342 ktmp += 8;
9343
9344 sum0 += r1[0] * ktmp[0];
9345 sum1 += r1[0] * ktmp[1];
9346 sum2 += r1[0] * ktmp[2];
9347 sum3 += r1[0] * ktmp[3];
9348 sum4 += r1[0] * ktmp[4];
9349 sum5 += r1[0] * ktmp[5];
9350 sum6 += r1[0] * ktmp[6];
9351 sum7 += r1[0] * ktmp[7];
9352 ktmp += 8;
9353
9354 sum0 += r1[1] * ktmp[0];
9355 sum1 += r1[1] * ktmp[1];
9356 sum2 += r1[1] * ktmp[2];
9357 sum3 += r1[1] * ktmp[3];
9358 sum4 += r1[1] * ktmp[4];
9359 sum5 += r1[1] * ktmp[5];
9360 sum6 += r1[1] * ktmp[6];
9361 sum7 += r1[1] * ktmp[7];
9362 ktmp += 8;
9363
9364 sum0 += r1[2] * ktmp[0];
9365 sum1 += r1[2] * ktmp[1];
9366 sum2 += r1[2] * ktmp[2];
9367 sum3 += r1[2] * ktmp[3];
9368 sum4 += r1[2] * ktmp[4];
9369 sum5 += r1[2] * ktmp[5];
9370 sum6 += r1[2] * ktmp[6];
9371 sum7 += r1[2] * ktmp[7];
9372 ktmp += 8;
9373
9374 sum0 += r2[0] * ktmp[0];
9375 sum1 += r2[0] * ktmp[1];
9376 sum2 += r2[0] * ktmp[2];
9377 sum3 += r2[0] * ktmp[3];
9378 sum4 += r2[0] * ktmp[4];
9379 sum5 += r2[0] * ktmp[5];
9380 sum6 += r2[0] * ktmp[6];
9381 sum7 += r2[0] * ktmp[7];
9382 ktmp += 8;
9383
9384 sum0 += r2[1] * ktmp[0];
9385 sum1 += r2[1] * ktmp[1];
9386 sum2 += r2[1] * ktmp[2];
9387 sum3 += r2[1] * ktmp[3];
9388 sum4 += r2[1] * ktmp[4];
9389 sum5 += r2[1] * ktmp[5];
9390 sum6 += r2[1] * ktmp[6];
9391 sum7 += r2[1] * ktmp[7];
9392 ktmp += 8;
9393
9394 sum0 += r2[2] * ktmp[0];
9395 sum1 += r2[2] * ktmp[1];
9396 sum2 += r2[2] * ktmp[2];
9397 sum3 += r2[2] * ktmp[3];
9398 sum4 += r2[2] * ktmp[4];
9399 sum5 += r2[2] * ktmp[5];
9400 sum6 += r2[2] * ktmp[6];
9401 sum7 += r2[2] * ktmp[7];
9402 ktmp += 8;
9403
9404 *outptr0 += sum0;
9405 *outptr1 += sum1;
9406 *outptr2 += sum2;
9407 *outptr3 += sum3;
9408 *outptr4 += sum4;
9409 *outptr5 += sum5;
9410 *outptr6 += sum6;
9411 *outptr7 += sum7;
9412
9413 ktmp -= 8 * 9;
9414
9415 outptr0++;
9416 outptr1++;
9417 outptr2++;
9418 outptr3++;
9419 outptr4++;
9420 outptr5++;
9421 outptr6++;
9422 outptr7++;
9423 #endif // __ARM_NEON
9424 r0 += 2;
9425 r1 += 2;
9426 r2 += 2;
9427 }
9428
9429 r0 += tailstep;
9430 r1 += tailstep;
9431 r2 += tailstep;
9432 }
9433
9434 ktmp += 8 * 9;
9435 }
9436 }
9437
9438 #pragma omp parallel for num_threads(opt.num_threads)
9439 for (int p = remain_outch_start; p < outch; p++)
9440 {
9441 Mat out = top_blob.channel(p);
9442
9443 const float bias0 = bias ? bias[p] : 0.f;
9444
9445 out.fill(bias0);
9446
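// remainder channels were packed one per kernel_tm channel starting at
// index outch/8; p/8 + p%8 recovers that slot for p >= remain_outch_start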
9447 const float* ktmp = _kernel.channel(p / 8 + p % 8);
9448
9449 for (int q = 0; q < inch; q++)
9450 {
9451 float* outptr = out;
9452
9453 const float* img0 = bottom_blob.channel(q);
9454
9455 const float* r0 = img0;
9456 const float* r1 = img0 + w;
9457 const float* r2 = img0 + w * 2;
9458
9459 const float* k0 = ktmp;
9460 const float* k1 = ktmp + 3;
9461 const float* k2 = ktmp + 6;
9462
9463 #if __ARM_NEON
9464 float32x4_t _k0123 = vld1q_f32(k0);
9465 float32x4_t _k3456 = vld1q_f32(k1);
9466 float32x4_t _k6789 = vld1q_f32(k2);
9467 #endif // __ARM_NEON
9468
9469 int i = 0;
9470
9471 for (; i < outh; i++)
9472 {
9473 #if __ARM_NEON
9474 int nn = outw >> 2;
9475 int remain = outw & 3;
9476 #else
9477 int remain = outw;
9478 #endif // __ARM_NEON
9479
9480 #if __ARM_NEON
9481 #if __aarch64__
9482 if (nn > 0)
9483 {
9484 asm volatile(
9485 "prfm pldl1keep, [%2, #256] \n"
9486 "ld2 {v2.4s, v3.4s}, [%2], #32 \n"
9487 "0: \n"
9488
9489 "prfm pldl1keep, [%1, #128] \n"
9490 "ld1 {v0.4s}, [%1] \n"
9491
9492 "fmla v0.4s, v2.4s, %10.s[0] \n"
9493 "fmul v10.4s, v3.4s, %10.s[1] \n"
9494
9495 "prfm pldl1keep, [%2, #256] \n"
9496 "ld2 {v8.4s, v9.4s}, [%2] \n"
9497 "ext v1.16b, v2.16b, v8.16b, #4 \n"
9498
9499 "fmul v11.4s, v1.4s, %10.s[2] \n"
9500
9501 "prfm pldl1keep, [%3, #256] \n"
9502 "ld2 {v2.4s, v3.4s}, [%3], #32 \n"
9503
9504 "fmla v0.4s, v2.4s, %11.s[0] \n"
9505 "fmla v10.4s, v3.4s, %11.s[1] \n"
9506
9507 "prfm pldl1keep, [%3, #256] \n"
9508 "ld2 {v8.4s, v9.4s}, [%3] \n"
9509 "ext v1.16b, v2.16b, v8.16b, #4 \n"
9510
9511 "fmla v11.4s, v1.4s, %11.s[2] \n"
9512
9513 "prfm pldl1keep, [%4, #256] \n"
9514 "ld2 {v2.4s, v3.4s}, [%4], #32 \n"
9515
9516 "fmla v0.4s, v2.4s, %12.s[0] \n"
9517 "fmla v10.4s, v3.4s, %12.s[1] \n"
9518
9519 "prfm pldl1keep, [%4, #256] \n"
9520 "ld2 {v8.4s, v9.4s}, [%4] \n"
9521 "ext v1.16b, v2.16b, v8.16b, #4 \n"
9522
9523 "fmla v11.4s, v1.4s, %12.s[2] \n"
9524
9525 "prfm pldl1keep, [%2, #256] \n"
9526 "ld2 {v2.4s, v3.4s}, [%2], #32 \n"
9527
9528 "fadd v0.4s, v0.4s, v10.4s \n"
9529 "fadd v0.4s, v0.4s, v11.4s \n"
9530
9531 "subs %w0, %w0, #1 \n"
9532 "st1 {v0.4s}, [%1], #16 \n"
9533 "bne 0b \n"
9534 "sub %2, %2, #32 \n"
9535 : "=r"(nn), // %0
9536 "=r"(outptr), // %1
9537 "=r"(r0), // %2
9538 "=r"(r1), // %3
9539 "=r"(r2) // %4
9540 : "0"(nn),
9541 "1"(outptr),
9542 "2"(r0),
9543 "3"(r1),
9544 "4"(r2),
9545 "w"(_k0123), // %10
9546 "w"(_k3456), // %11
9547 "w"(_k6789) // %12
9548 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
9549 }
9550 #else
9551 if (nn > 0)
9552 {
9553 asm volatile(
9554 "pld [%2, #256] \n"
9555 "vld2.f32 {d4-d7}, [%2]! \n"
9556
9557 "0: \n"
9558 "pld [%1, #128] \n"
9559 "vld1.f32 {d0-d1}, [%1] \n"
9560
9561 "vmla.f32 q0, q2, %e10[0] \n"
9562 "vmul.f32 q10, q3, %e10[1] \n"
9563
9564 "pld [%2, #128] \n"
9565 "vld2.f32 {d16-d17}, [%2] \n"
9566 "vext.32 q1, q2, q8, #1 \n"
9567
9568 "vmul.f32 q11, q1, %f10[0] \n"
9569
9570 "pld [%3, #256] \n"
9571 "vld2.f32 {d4-d7}, [%3]! \n"
9572
9573 "vmla.f32 q0, q2, %e11[0] \n"
9574 "vmla.f32 q10, q3, %e11[1] \n"
9575
9576 "pld [%3, #128] \n"
9577 "vld2.f32 {d16-d17}, [%3] \n"
9578 "vext.32 q1, q2, q8, #1 \n"
9579
9580 "vmla.f32 q11, q1, %f11[0] \n"
9581
9582 "pld [%4, #256] \n"
9583 "vld2.f32 {d4-d7}, [%4]! \n"
9584
9585 "vmla.f32 q0, q2, %e12[0] \n"
9586 "vmla.f32 q10, q3, %e12[1] \n"
9587
9588 "pld [%4, #128] \n"
9589 "vld2.f32 {d16-d17}, [%4] \n"
9590 "vext.32 q1, q2, q8, #1 \n"
9591
9592 "vmla.f32 q11, q1, %f12[0] \n"
9593
9594 "pld [%2, #256] \n"
9595 "vld2.f32 {d4-d7}, [%2]! \n"
9596
9597 "vadd.f32 q0, q0, q10 \n"
9598 "vadd.f32 q0, q0, q11 \n"
9599
9600 "subs %0, #1 \n"
9601 "vst1.f32 {d0-d1}, [%1]! \n"
9602 "bne 0b \n"
9603 "sub %2, #32 \n"
9604 : "=r"(nn), // %0
9605 "=r"(outptr), // %1
9606 "=r"(r0), // %2
9607 "=r"(r1), // %3
9608 "=r"(r2) // %4
9609 : "0"(nn),
9610 "1"(outptr),
9611 "2"(r0),
9612 "3"(r1),
9613 "4"(r2),
9614 "w"(_k0123), // %10
9615 "w"(_k3456), // %11
9616 "w"(_k6789) // %12
9617 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
9618 }
9619 #endif // __aarch64__
9620 #endif // __ARM_NEON
9621 for (; remain > 0; remain--)
9622 {
9623 #if __ARM_NEON
9624 float32x4_t _r00 = vld1q_f32(r0);
9625 float32x4_t _r10 = vld1q_f32(r1);
9626 float32x4_t _r20 = vld1q_f32(r2);
9627
9628 float32x4_t _sum = vmulq_f32(_r00, _k0123);
9629 _sum = vmlaq_f32(_sum, _r10, _k3456);
9630 _sum = vmlaq_f32(_sum, _r20, _k6789);
9631
9632 _sum = vsetq_lane_f32(*outptr, _sum, 3);
9633
9634 #if __aarch64__
9635 *outptr = vaddvq_f32(_sum);
9636 #else
9637 float32x2_t _ss = vadd_f32(vget_low_f32(_sum), vget_high_f32(_sum));
9638 _ss = vpadd_f32(_ss, _ss);
9639
9640 *outptr = vget_lane_f32(_ss, 0);
9641 #endif // __aarch64__
9642 #else
9643 float sum = 0.f;
9644
9645 sum += r0[0] * ktmp[0];
9646 sum += r0[1] * ktmp[1];
9647 sum += r0[2] * ktmp[2];
9648 sum += r1[0] * ktmp[3];
9649 sum += r1[1] * ktmp[4];
9650 sum += r1[2] * ktmp[5];
9651 sum += r2[0] * ktmp[6];
9652 sum += r2[1] * ktmp[7];
9653 sum += r2[2] * ktmp[8];
9654
9655 *outptr += sum;
9656 #endif // __ARM_NEON
9657
9658 r0 += 2;
9659 r1 += 2;
9660 r2 += 2;
9661 outptr++;
9662 }
9663
9664 r0 += tailstep;
9665 r1 += tailstep;
9666 r2 += tailstep;
9667 }
9668
9669 ktmp += 9;
9670 }
9671 }
9672 }
9673