1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 static void conv3x3s1_winograd64_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
16 {
17 // winograd63 transform kernel
18 Mat kernel_tm;
19 kernel_tm.create(8 * 8, inch, outch);
20
21 const float ktm[8][3] = {
22 {1.0f, 0.0f, 0.0f},
23 {-2.0f / 9, -2.0f / 9, -2.0f / 9},
24 {-2.0f / 9, 2.0f / 9, -2.0f / 9},
25 {1.0f / 90, 1.0f / 45, 2.0f / 45},
26 {1.0f / 90, -1.0f / 45, 2.0f / 45},
27 {1.0f / 45, 1.0f / 90, 1.0f / 180},
28 {1.0f / 45, -1.0f / 90, 1.0f / 180},
29 {0.0f, 0.0f, 1.0f}
30 };
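// note (added): ktm is the 8x3 Winograd G matrix for F(6x6, 3x3); each 3x3 kernel g is
// expanded into an 8x8 transformed tile as G * g * Gt, computed below as two passes of
// the same 1-D transform (the "h" and "v" loops).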
31
32 #pragma omp parallel for
33 for (int p = 0; p < outch; p++)
34 {
35 for (int q = 0; q < inch; q++)
36 {
37 const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
38 float* kernel_tm0 = kernel_tm.channel(p).row(q);
39
40 // transform kernel, transposed
41 const float* k0 = kernel0;
42 const float* k1 = kernel0 + 3;
43 const float* k2 = kernel0 + 6;
44
45 // h
46 float tmp[8][3];
47 for (int i = 0; i < 8; i++)
48 {
49 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
50 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
51 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
52 }
53
54 // v
55 for (int j = 0; j < 8; j++)
56 {
57 float* tmpp = &tmp[j][0];
58
59 for (int i = 0; i < 8; i++)
60 {
61 kernel_tm0[j * 8 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
62 }
63 }
64 }
65 }
66
67 // interleave
68 // src = 64-inch-outch
69 // dst = 4b-4a-inch/4a-64-outch/4b;
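// note (added): in other words, for each of the 64 transform positions the weights are
// regrouped into blocks of 4 input channels x 8 (aarch64) or 4 output channels, so the
// dot loops further down can fetch one contiguous block per load and multiply by lane.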
70 #if __aarch64__
71 kernel_tm_pack4.create(2 * inch / 4, 64, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
72 #else
73 kernel_tm_pack4.create(inch / 4, 64, outch / 4, (size_t)4u * 16, 16);
74 #endif
75
76 int q = 0;
77 #if __aarch64__
78 for (; q + 7 < outch; q += 8)
79 {
80 const Mat k0 = kernel_tm.channel(q);
81 const Mat k1 = kernel_tm.channel(q + 1);
82 const Mat k2 = kernel_tm.channel(q + 2);
83 const Mat k3 = kernel_tm.channel(q + 3);
84 const Mat k4 = kernel_tm.channel(q + 4);
85 const Mat k5 = kernel_tm.channel(q + 5);
86 const Mat k6 = kernel_tm.channel(q + 6);
87 const Mat k7 = kernel_tm.channel(q + 7);
88
89 Mat g0 = kernel_tm_pack4.channel(q / 8);
90
91 for (int k = 0; k < 64; k++)
92 {
93 float* g00 = g0.row(k);
94
95 for (int p = 0; p + 3 < inch; p += 4)
96 {
97 const float* k00 = k0.row(p);
98 const float* k01 = k0.row(p + 1);
99 const float* k02 = k0.row(p + 2);
100 const float* k03 = k0.row(p + 3);
101
102 const float* k10 = k1.row(p);
103 const float* k11 = k1.row(p + 1);
104 const float* k12 = k1.row(p + 2);
105 const float* k13 = k1.row(p + 3);
106
107 const float* k20 = k2.row(p);
108 const float* k21 = k2.row(p + 1);
109 const float* k22 = k2.row(p + 2);
110 const float* k23 = k2.row(p + 3);
111
112 const float* k30 = k3.row(p);
113 const float* k31 = k3.row(p + 1);
114 const float* k32 = k3.row(p + 2);
115 const float* k33 = k3.row(p + 3);
116
117 const float* k40 = k4.row(p);
118 const float* k41 = k4.row(p + 1);
119 const float* k42 = k4.row(p + 2);
120 const float* k43 = k4.row(p + 3);
121
122 const float* k50 = k5.row(p);
123 const float* k51 = k5.row(p + 1);
124 const float* k52 = k5.row(p + 2);
125 const float* k53 = k5.row(p + 3);
126
127 const float* k60 = k6.row(p);
128 const float* k61 = k6.row(p + 1);
129 const float* k62 = k6.row(p + 2);
130 const float* k63 = k6.row(p + 3);
131
132 const float* k70 = k7.row(p);
133 const float* k71 = k7.row(p + 1);
134 const float* k72 = k7.row(p + 2);
135 const float* k73 = k7.row(p + 3);
136
137 g00[0] = k00[k];
138 g00[1] = k10[k];
139 g00[2] = k20[k];
140 g00[3] = k30[k];
141
142 g00[4] = k40[k];
143 g00[5] = k50[k];
144 g00[6] = k60[k];
145 g00[7] = k70[k];
146
147 g00[8] = k01[k];
148 g00[9] = k11[k];
149 g00[10] = k21[k];
150 g00[11] = k31[k];
151
152 g00[12] = k41[k];
153 g00[13] = k51[k];
154 g00[14] = k61[k];
155 g00[15] = k71[k];
156
157 g00[16] = k02[k];
158 g00[17] = k12[k];
159 g00[18] = k22[k];
160 g00[19] = k32[k];
161
162 g00[20] = k42[k];
163 g00[21] = k52[k];
164 g00[22] = k62[k];
165 g00[23] = k72[k];
166
167 g00[24] = k03[k];
168 g00[25] = k13[k];
169 g00[26] = k23[k];
170 g00[27] = k33[k];
171
172 g00[28] = k43[k];
173 g00[29] = k53[k];
174 g00[30] = k63[k];
175 g00[31] = k73[k];
176
177 g00 += 32;
178 }
179 }
180 }
181 #endif // __aarch64__
182 for (; q + 3 < outch; q += 4)
183 {
184 const Mat k0 = kernel_tm.channel(q);
185 const Mat k1 = kernel_tm.channel(q + 1);
186 const Mat k2 = kernel_tm.channel(q + 2);
187 const Mat k3 = kernel_tm.channel(q + 3);
188
189 #if __aarch64__
190 Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
191 #else
192 Mat g0 = kernel_tm_pack4.channel(q / 4);
193 #endif
194
195 for (int k = 0; k < 64; k++)
196 {
197 float* g00 = g0.row(k);
198
199 for (int p = 0; p + 3 < inch; p += 4)
200 {
201 const float* k00 = k0.row(p);
202 const float* k01 = k0.row(p + 1);
203 const float* k02 = k0.row(p + 2);
204 const float* k03 = k0.row(p + 3);
205
206 const float* k10 = k1.row(p);
207 const float* k11 = k1.row(p + 1);
208 const float* k12 = k1.row(p + 2);
209 const float* k13 = k1.row(p + 3);
210
211 const float* k20 = k2.row(p);
212 const float* k21 = k2.row(p + 1);
213 const float* k22 = k2.row(p + 2);
214 const float* k23 = k2.row(p + 3);
215
216 const float* k30 = k3.row(p);
217 const float* k31 = k3.row(p + 1);
218 const float* k32 = k3.row(p + 2);
219 const float* k33 = k3.row(p + 3);
220
221 g00[0] = k00[k];
222 g00[1] = k10[k];
223 g00[2] = k20[k];
224 g00[3] = k30[k];
225
226 g00[4] = k01[k];
227 g00[5] = k11[k];
228 g00[6] = k21[k];
229 g00[7] = k31[k];
230
231 g00[8] = k02[k];
232 g00[9] = k12[k];
233 g00[10] = k22[k];
234 g00[11] = k32[k];
235
236 g00[12] = k03[k];
237 g00[13] = k13[k];
238 g00[14] = k23[k];
239 g00[15] = k33[k];
240
241 g00 += 16;
242 }
243 }
244 }
245 }
246
247 static void conv3x3s1_winograd64_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
248 {
249 int w = bottom_blob.w;
250 int h = bottom_blob.h;
251 int inch = bottom_blob.c;
252 size_t elemsize = bottom_blob.elemsize;
253 int elempack = bottom_blob.elempack;
254
255 int outw = top_blob.w;
256 int outh = top_blob.h;
257 int outch = top_blob.c;
258
259 // pad to 6n+2
260 Mat bottom_blob_bordered = bottom_blob;
261
262 outw = (outw + 5) / 6 * 6;
263 outh = (outh + 5) / 6 * 6;
264
265 w = outw + 2;
266 h = outh + 2;
267 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
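// note (added): each 6x6 output tile needs an 8x8 input tile, so the input is padded on
// the right/bottom to (6n+2) x (6m+2); e.g. (hypothetical sizes) a 10x10 output is
// rounded up to 12x12, which needs a 14x14 padded input.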
268
269 const float* bias = _bias;
270
271 // BEGIN transform input
272 Mat bottom_blob_tm;
273 {
274 int w_tm = outw / 6 * 8;
275 int h_tm = outh / 6 * 8;
276
277 const int tiles = w_tm / 8 * h_tm / 8;
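// note (added): tiles = number of 8x8 input patches, one per 6x6 output patch; adjacent
// patches are read at a stride of 6 and therefore overlap by 2 pixels.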
278
279 bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
280
281 // const float itm[8][8] = {
282 // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f},
283 //
284 // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f},
285 // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f},
286 //
287 // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f},
288 // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f},
289 //
290 // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f},
291 // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f},
292 //
293 // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f}
294 // };
295
296 // 0 = r00 - r06 + (r04 - r02) * 5.25
297 // 7 = r07 - r01 + (r03 - r05) * 5.25
298
299 // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
300 // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
301
302 // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
303 // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
304
305 // reuse r04 * 1.25
306 // reuse r03 * 2.5
307 // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
308 // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
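// note (added): the 8x8 input transform is separable: the 1-D transform above is applied
// first along the rows of each tile (first m loop, intermediate kept in tmp[8][8][4])
// and then along the columns (second m loop), all on pack4 lanes.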
309
310 #pragma omp parallel for num_threads(opt.num_threads)
311 for (int q = 0; q < inch; q++)
312 {
313 const Mat img0 = bottom_blob_bordered.channel(q);
314 Mat img0_tm = bottom_blob_tm.channel(q);
315
316 float tmp[8][8][4];
317
318 // tile
319 for (int i = 0; i < h_tm / 8; i++)
320 {
321 for (int j = 0; j < w_tm / 8; j++)
322 {
323 const float* r0 = img0.row(i * 6) + (j * 6) * 4;
324
325 for (int m = 0; m < 8; m++)
326 {
327 float32x4_t _r00 = vld1q_f32(r0);
328 float32x4_t _r01 = vld1q_f32(r0 + 4);
329 float32x4_t _r02 = vld1q_f32(r0 + 8);
330 float32x4_t _r03 = vld1q_f32(r0 + 12);
331 float32x4_t _r04 = vld1q_f32(r0 + 16);
332 float32x4_t _r05 = vld1q_f32(r0 + 20);
333 float32x4_t _r06 = vld1q_f32(r0 + 24);
334 float32x4_t _r07 = vld1q_f32(r0 + 28);
335
336 float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
337 float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
338 vst1q_f32(tmp[0][m], _tmp0m);
339 vst1q_f32(tmp[7][m], _tmp7m);
340
341 // tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25;
342 // tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25;
343
344 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
345 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
346
347 // float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25);
348 // float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25);
349
350 float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
351 float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
352 vst1q_f32(tmp[1][m], _tmp1m);
353 vst1q_f32(tmp[2][m], _tmp2m);
354
355 // tmp[1][m] = tmp12a + tmp12b;
356 // tmp[2][m] = tmp12a - tmp12b;
357
358 float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
359 float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
360
361 // float tmp34a = (r0[6] + r0[2] * 0.25 - r0[4] * 1.25);
362 // float tmp34b = (r0[1] * 0.5 - r0[3] * 2.5 + r0[5] * 2);
363
364 float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
365 float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
366 vst1q_f32(tmp[3][m], _tmp3m);
367 vst1q_f32(tmp[4][m], _tmp4m);
368
369 // tmp[3][m] = tmp34a + tmp34b;
370 // tmp[4][m] = tmp34a - tmp34b;
371
372 float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
373 float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
374
375 // float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25) * 4);
376 // float tmp56b = (r0[1] * 2 - r0[3] * 2.5 + r0[5] * 0.5);
377
378 float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
379 float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
380 vst1q_f32(tmp[5][m], _tmp5m);
381 vst1q_f32(tmp[6][m], _tmp6m);
382
383 // tmp[5][m] = tmp56a + tmp56b;
384 // tmp[6][m] = tmp56a - tmp56b;
385
386 r0 += w * 4;
387 }
388
389 float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 8 + j) * 4;
390 float* r0_tm_1 = r0_tm_0 + tiles * 4;
391 float* r0_tm_2 = r0_tm_0 + tiles * 8;
392 float* r0_tm_3 = r0_tm_0 + tiles * 12;
393 float* r0_tm_4 = r0_tm_0 + tiles * 16;
394 float* r0_tm_5 = r0_tm_0 + tiles * 20;
395 float* r0_tm_6 = r0_tm_0 + tiles * 24;
396 float* r0_tm_7 = r0_tm_0 + tiles * 28;
397
398 for (int m = 0; m < 8; m++)
399 {
400 float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
401 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
402 float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
403 float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
404 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
405 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
406 float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
407 float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
408
409 float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
410 float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
411
412 // r0_tm[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25;
413 // r0_tm[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25;
414
415 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
416 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
417
418 // float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25);
419 // float tmp12b = (tmp0[1] + tmp0[5] - tmp0[3] * 4.25);
420
421 float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
422 float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
423
424 // r0_tm[1] = tmp12a + tmp12b;
425 // r0_tm[2] = tmp12a - tmp12b;
426
427 float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
428 float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
429
430 // float tmp34a = (tmp0[6] + tmp0[2] * 0.25 - tmp0[4] * 1.25);
431 // float tmp34b = (tmp0[1] * 0.5 - tmp0[3] * 2.5 + tmp0[5] * 2);
432
433 float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
434 float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
435
436 // r0_tm[3] = tmp34a + tmp34b;
437 // r0_tm[4] = tmp34a - tmp34b;
438
439 float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
440 float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
441
442 // float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25) * 4);
443 // float tmp56b = (tmp0[1] * 2 - tmp0[3] * 2.5 + tmp0[5] * 0.5);
444
445 float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
446 float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
447
448 // r0_tm[5] = tmp56a + tmp56b;
449 // r0_tm[6] = tmp56a - tmp56b;
450
451 vst1q_f32(r0_tm_0, _r0tm0);
452 vst1q_f32(r0_tm_1, _r0tm1);
453 vst1q_f32(r0_tm_2, _r0tm2);
454 vst1q_f32(r0_tm_3, _r0tm3);
455 vst1q_f32(r0_tm_4, _r0tm4);
456 vst1q_f32(r0_tm_5, _r0tm5);
457 vst1q_f32(r0_tm_6, _r0tm6);
458 vst1q_f32(r0_tm_7, _r0tm7);
459
460 r0_tm_0 += tiles * 32;
461 r0_tm_1 += tiles * 32;
462 r0_tm_2 += tiles * 32;
463 r0_tm_3 += tiles * 32;
464 r0_tm_4 += tiles * 32;
465 r0_tm_5 += tiles * 32;
466 r0_tm_6 += tiles * 32;
467 r0_tm_7 += tiles * 32;
468 }
469 }
470 }
471 }
472 }
473 bottom_blob_bordered = Mat();
474 // END transform input
475
476 // BEGIN dot
477 Mat top_blob_tm;
478 {
479 int w_tm = outw / 6 * 8;
480 int h_tm = outh / 6 * 8;
481
482 const int tiles = h_tm / 8 * w_tm / 8;
483
484 // permute
485 // bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
486 Mat bottom_blob_tm2;
487 #if __aarch64__
488 if (tiles >= 12)
489 bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, 64, elemsize, elempack, opt.workspace_allocator);
490 else if (tiles >= 8)
491 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
492 else if (tiles >= 4)
493 bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
494 else if (tiles >= 2)
495 bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
496 else // if (tiles >= 1)
497 bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
498 #else
499 if (tiles >= 8)
500 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
501 else if (tiles >= 4)
502 bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
503 else if (tiles >= 2)
504 bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 64, elemsize, elempack, opt.workspace_allocator);
505 else // if (tiles >= 1)
506 bottom_blob_tm2.create(1 * inch, tiles, 64, elemsize, elempack, opt.workspace_allocator);
507 #endif
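// note (added): the height passed to create() above is the number of tile groups after
// batching tiles in blocks of 12/8/4/2/1 (8/4/2/1 without __aarch64__); each row holds
// one group for all input channels, interleaved so the dot kernels below read sequentially.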
508
509 #pragma omp parallel for num_threads(opt.num_threads)
510 for (int r = 0; r < 64; r++)
511 {
512 Mat tm2 = bottom_blob_tm2.channel(r);
513
514 // tile
515 int i = 0;
516 #if __aarch64__
517 for (; i + 11 < tiles; i += 12)
518 {
519 float* tm2p = tm2.row(i / 12);
520
521 const float* r0 = bottom_blob_tm;
522
523 r0 += (r * tiles + i) * 4;
524
525 for (int q = 0; q < inch; q++)
526 {
527 asm volatile(
528 "prfm pldl1keep, [%0, #512] \n"
529 "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
530 "prfm pldl1keep, [%0, #512] \n"
531 "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
532 "prfm pldl1keep, [%0, #512] \n"
533 "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
534 "st1 {v0.4s}, [%1], #16 \n"
535 "st1 {v4.4s}, [%1], #16 \n"
536 "st1 {v8.4s}, [%1], #16 \n"
537 "sub %0, %0, #128 \n"
538 "st1 {v1.4s}, [%1], #16 \n"
539 "st1 {v5.4s}, [%1], #16 \n"
540 "st1 {v9.4s}, [%1], #16 \n"
541 "st1 {v2.4s}, [%1], #16 \n"
542 "st1 {v6.4s}, [%1], #16 \n"
543 "st1 {v10.4s}, [%1], #16 \n"
544 "st1 {v3.4s}, [%1], #16 \n"
545 "st1 {v7.4s}, [%1], #16 \n"
546 "st1 {v11.4s}, [%1], #16 \n"
547 : "=r"(r0), // %0
548 "=r"(tm2p) // %1
549 : "0"(r0),
550 "1"(tm2p)
551 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
552 r0 += bottom_blob_tm.cstep * 4;
553 }
554 }
555 #endif
556 for (; i + 7 < tiles; i += 8)
557 {
558 #if __aarch64__
559 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
560 #else
561 float* tm2p = tm2.row(i / 8);
562 #endif
563
564 const float* r0 = bottom_blob_tm;
565
566 r0 += (r * tiles + i) * 4;
567
568 for (int q = 0; q < inch; q++)
569 {
570 #if __aarch64__
571 asm volatile(
572 "prfm pldl1keep, [%0, #512] \n"
573 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
574 "prfm pldl1keep, [%0, #512] \n"
575 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
576 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
577 "sub %0, %0, #64 \n"
578 "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
579 : "=r"(r0), // %0
580 "=r"(tm2p) // %1
581 : "0"(r0),
582 "1"(tm2p)
583 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
584 #else
585 asm volatile(
586 "pld [%0, #512] \n"
587 "vldm %0!, {d0-d7} \n"
588 "pld [%0, #512] \n"
589 "vldm %0, {d16-d23} \n"
590
591 // transpose 8x4
592 "vtrn.32 q0, q1 \n"
593 "vtrn.32 q2, q3 \n"
594 "vtrn.32 q8, q9 \n"
595 "vtrn.32 q10, q11 \n"
596 "vswp d1, d4 \n"
597 "vswp d3, d6 \n"
598 "vswp d17, d20 \n"
599 "vswp d19, d22 \n"
600 "vswp q1, q8 \n"
601 "vswp q3, q10 \n"
602
603 "vst1.f32 {d0-d3}, [%1 :128]! \n"
604 "vst1.f32 {d16-d19}, [%1 :128]! \n"
605 "sub %0, %0, #64 \n"
606 "vst1.f32 {d4-d7}, [%1 :128]! \n"
607 "vst1.f32 {d20-d23}, [%1 :128]! \n"
608 : "=r"(r0), // %0
609 "=r"(tm2p) // %1
610 : "0"(r0),
611 "1"(tm2p)
612 : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
613 #endif
614 r0 += bottom_blob_tm.cstep * 4;
615 }
616 }
617 for (; i + 3 < tiles; i += 4)
618 {
619 #if __aarch64__
620 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
621 #else
622 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
623 #endif
624
625 const float* r0 = bottom_blob_tm;
626
627 r0 += (r * tiles + i) * 4;
628
629 for (int q = 0; q < inch; q++)
630 {
631 #if __aarch64__
632 asm volatile(
633 "prfm pldl1keep, [%0, #512] \n"
634 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
635 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
636 : "=r"(r0), // %0
637 "=r"(tm2p) // %1
638 : "0"(r0),
639 "1"(tm2p)
640 : "memory", "v0", "v1", "v2", "v3");
641 #else
642 asm volatile(
643 "pld [%0, #512] \n"
644 "vldm %0, {d0-d7} \n"
645 "vstm %1!, {d0-d7} \n"
646 : "=r"(r0), // %0
647 "=r"(tm2p) // %1
648 : "0"(r0),
649 "1"(tm2p)
650 : "memory", "q0", "q1", "q2", "q3");
651 #endif // __aarch64__
652 r0 += bottom_blob_tm.cstep * 4;
653 }
654 }
655 for (; i + 1 < tiles; i += 2)
656 {
657 #if __aarch64__
658 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
659 #else
660 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
661 #endif
662
663 const float* r0 = bottom_blob_tm;
664
665 r0 += (r * tiles + i) * 4;
666
667 for (int q = 0; q < inch; q++)
668 {
669 #if __aarch64__
670 asm volatile(
671 "prfm pldl1keep, [%0, #256] \n"
672 "ld1 {v0.4s, v1.4s}, [%0] \n"
673 "st1 {v0.4s, v1.4s}, [%1], #32 \n"
674 : "=r"(r0), // %0
675 "=r"(tm2p) // %1
676 : "0"(r0),
677 "1"(tm2p)
678 : "memory", "v0", "v1");
679 #else
680 asm volatile(
681 "pld [%0, #256] \n"
682 "vld1.f32 {d0-d3}, [%0 :128] \n"
683 "vst1.f32 {d0-d3}, [%1 :128]! \n"
684 : "=r"(r0), // %0
685 "=r"(tm2p) // %1
686 : "0"(r0),
687 "1"(tm2p)
688 : "memory", "q0", "q1");
689 #endif // __aarch64__
690 r0 += bottom_blob_tm.cstep * 4;
691 }
692 }
693 for (; i < tiles; i++)
694 {
695 #if __aarch64__
696 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
697 #else
698 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
699 #endif
700
701 const float* r0 = bottom_blob_tm;
702
703 r0 += (r * tiles + i) * 4;
704
705 for (int q = 0; q < inch; q++)
706 {
707 #if __aarch64__
708 asm volatile(
709 "prfm pldl1keep, [%0, #128] \n"
710 "ld1 {v0.4s}, [%0] \n"
711 "st1 {v0.4s}, [%1], #16 \n"
712 : "=r"(r0), // %0
713 "=r"(tm2p) // %1
714 : "0"(r0),
715 "1"(tm2p)
716 : "memory", "v0");
717 #else
718 asm volatile(
719 "pld [%0, #128] \n"
720 "vld1.f32 {d0-d1}, [%0 :128] \n"
721 "vst1.f32 {d0-d1}, [%1 :128]! \n"
722 : "=r"(r0), // %0
723 "=r"(tm2p) // %1
724 : "0"(r0),
725 "1"(tm2p)
726 : "memory", "q0");
727 #endif // __aarch64__
728 r0 += bottom_blob_tm.cstep * 4;
729 }
730 }
731 }
732
733 bottom_blob_tm = Mat();
734 // permute end
735
736 top_blob_tm.create(tiles, 64, outch, elemsize, elempack, opt.workspace_allocator);
737
738 int remain_outch_start = 0;
739
740 #if __ARM_NEON && __aarch64__
741 int nn_outch = 0;
742 nn_outch = outch >> 1;
743 remain_outch_start = nn_outch << 1;
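// note (added): on aarch64 the dot stage consumes two pack4 output channels (8 output
// channels) per iteration, matching the 8-outch kernel interleave done at transform time;
// any remaining channels fall through to the generic per-channel loop below.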
744
745 #pragma omp parallel for num_threads(opt.num_threads)
746 for (int pp = 0; pp < nn_outch; pp++)
747 {
748 int p = pp * 2;
749
750 float* output0_tm = top_blob_tm.channel(p);
751 float* output1_tm = top_blob_tm.channel(p + 1);
752
753 const Mat kernel01_tm = kernel_tm.channel(pp);
754
755 for (int r = 0; r < 64; r++)
756 {
757 const Mat bb2 = bottom_blob_tm2.channel(r);
758
759 int i = 0;
760 for (; i + 11 < tiles; i += 12)
761 {
762 const float* r0 = bb2.row(i / 12);
763
764 const float* k01 = kernel01_tm.row(r);
765
766 int nn = inch; // inch always > 0
767
768 asm volatile(
769 "eor v8.16b, v8.16b, v8.16b \n"
770 "eor v9.16b, v9.16b, v9.16b \n"
771 "eor v10.16b, v10.16b, v10.16b \n"
772 "eor v11.16b, v11.16b, v11.16b \n"
773 "eor v12.16b, v12.16b, v12.16b \n"
774 "eor v13.16b, v13.16b, v13.16b \n"
775 "eor v14.16b, v14.16b, v14.16b \n"
776 "eor v15.16b, v15.16b, v15.16b \n"
777 "eor v16.16b, v16.16b, v16.16b \n"
778 "eor v17.16b, v17.16b, v17.16b \n"
779 "eor v18.16b, v18.16b, v18.16b \n"
780 "eor v19.16b, v19.16b, v19.16b \n"
781 "eor v20.16b, v20.16b, v20.16b \n"
782 "eor v21.16b, v21.16b, v21.16b \n"
783 "eor v22.16b, v22.16b, v22.16b \n"
784 "eor v23.16b, v23.16b, v23.16b \n"
785 "eor v24.16b, v24.16b, v24.16b \n"
786 "eor v25.16b, v25.16b, v25.16b \n"
787 "eor v26.16b, v26.16b, v26.16b \n"
788 "eor v27.16b, v27.16b, v27.16b \n"
789 "eor v28.16b, v28.16b, v28.16b \n"
790 "eor v29.16b, v29.16b, v29.16b \n"
791 "eor v30.16b, v30.16b, v30.16b \n"
792 "eor v31.16b, v31.16b, v31.16b \n"
793
794 "0: \n"
795
796 "prfm pldl1keep, [%3, #512] \n"
797 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
798
799 "prfm pldl1keep, [%4, #512] \n"
800 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w0011_01
801
802 "fmla v8.4s, v4.4s, v0.s[0] \n"
803 "fmla v9.4s, v4.4s, v0.s[1] \n"
804 "fmla v10.4s, v4.4s, v0.s[2] \n"
805 "fmla v11.4s, v4.4s, v0.s[3] \n"
806 "fmla v12.4s, v4.4s, v1.s[0] \n"
807 "fmla v13.4s, v4.4s, v1.s[1] \n"
808 "fmla v14.4s, v4.4s, v1.s[2] \n"
809 "fmla v15.4s, v4.4s, v1.s[3] \n"
810 "fmla v16.4s, v4.4s, v2.s[0] \n"
811 "fmla v17.4s, v4.4s, v2.s[1] \n"
812 "fmla v18.4s, v4.4s, v2.s[2] \n"
813 "fmla v19.4s, v4.4s, v2.s[3] \n"
814
815 "fmla v20.4s, v5.4s, v0.s[0] \n"
816 "fmla v21.4s, v5.4s, v0.s[1] \n"
817 "fmla v22.4s, v5.4s, v0.s[2] \n"
818 "fmla v23.4s, v5.4s, v0.s[3] \n"
819 "fmla v24.4s, v5.4s, v1.s[0] \n"
820 "fmla v25.4s, v5.4s, v1.s[1] \n"
821 "fmla v26.4s, v5.4s, v1.s[2] \n"
822 "fmla v27.4s, v5.4s, v1.s[3] \n"
823 "fmla v28.4s, v5.4s, v2.s[0] \n"
824 "fmla v29.4s, v5.4s, v2.s[1] \n"
825 "fmla v30.4s, v5.4s, v2.s[2] \n"
826 "fmla v31.4s, v5.4s, v2.s[3] \n"
827
828 "fmla v8.4s, v6.4s, v3.s[0] \n"
829 "fmla v9.4s, v6.4s, v3.s[1] \n"
830 "fmla v10.4s, v6.4s, v3.s[2] \n"
831 "fmla v11.4s, v6.4s, v3.s[3] \n"
832
833 "fmla v20.4s, v7.4s, v3.s[0] \n"
834 "fmla v21.4s, v7.4s, v3.s[1] \n"
835 "fmla v22.4s, v7.4s, v3.s[2] \n"
836 "fmla v23.4s, v7.4s, v3.s[3] \n"
837
838 "prfm pldl1keep, [%3, #512] \n"
839 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
840
841 "fmla v12.4s, v6.4s, v0.s[0] \n"
842 "fmla v13.4s, v6.4s, v0.s[1] \n"
843 "fmla v14.4s, v6.4s, v0.s[2] \n"
844 "fmla v15.4s, v6.4s, v0.s[3] \n"
845 "fmla v16.4s, v6.4s, v1.s[0] \n"
846 "fmla v17.4s, v6.4s, v1.s[1] \n"
847 "fmla v18.4s, v6.4s, v1.s[2] \n"
848 "fmla v19.4s, v6.4s, v1.s[3] \n"
849
850 "fmla v24.4s, v7.4s, v0.s[0] \n"
851 "fmla v25.4s, v7.4s, v0.s[1] \n"
852 "fmla v26.4s, v7.4s, v0.s[2] \n"
853 "fmla v27.4s, v7.4s, v0.s[3] \n"
854 "fmla v28.4s, v7.4s, v1.s[0] \n"
855 "fmla v29.4s, v7.4s, v1.s[1] \n"
856 "fmla v30.4s, v7.4s, v1.s[2] \n"
857 "fmla v31.4s, v7.4s, v1.s[3] \n"
858
859 "prfm pldl1keep, [%4, #512] \n"
860 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w2233_01
861
862 "fmla v8.4s, v4.4s, v2.s[0] \n"
863 "fmla v9.4s, v4.4s, v2.s[1] \n"
864 "fmla v10.4s, v4.4s, v2.s[2] \n"
865 "fmla v11.4s, v4.4s, v2.s[3] \n"
866 "fmla v12.4s, v4.4s, v3.s[0] \n"
867 "fmla v13.4s, v4.4s, v3.s[1] \n"
868 "fmla v14.4s, v4.4s, v3.s[2] \n"
869 "fmla v15.4s, v4.4s, v3.s[3] \n"
870
871 "fmla v20.4s, v5.4s, v2.s[0] \n"
872 "fmla v21.4s, v5.4s, v2.s[1] \n"
873 "fmla v22.4s, v5.4s, v2.s[2] \n"
874 "fmla v23.4s, v5.4s, v2.s[3] \n"
875 "fmla v24.4s, v5.4s, v3.s[0] \n"
876 "fmla v25.4s, v5.4s, v3.s[1] \n"
877 "fmla v26.4s, v5.4s, v3.s[2] \n"
878 "fmla v27.4s, v5.4s, v3.s[3] \n"
879
880 "prfm pldl1keep, [%3, #512] \n"
881 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
882
883 "fmla v16.4s, v4.4s, v0.s[0] \n"
884 "fmla v17.4s, v4.4s, v0.s[1] \n"
885 "fmla v18.4s, v4.4s, v0.s[2] \n"
886 "fmla v19.4s, v4.4s, v0.s[3] \n"
887
888 "fmla v28.4s, v5.4s, v0.s[0] \n"
889 "fmla v29.4s, v5.4s, v0.s[1] \n"
890 "fmla v30.4s, v5.4s, v0.s[2] \n"
891 "fmla v31.4s, v5.4s, v0.s[3] \n"
892
893 "fmla v8.4s, v6.4s, v1.s[0] \n"
894 "fmla v9.4s, v6.4s, v1.s[1] \n"
895 "fmla v10.4s, v6.4s, v1.s[2] \n"
896 "fmla v11.4s, v6.4s, v1.s[3] \n"
897 "fmla v12.4s, v6.4s, v2.s[0] \n"
898 "fmla v13.4s, v6.4s, v2.s[1] \n"
899 "fmla v14.4s, v6.4s, v2.s[2] \n"
900 "fmla v15.4s, v6.4s, v2.s[3] \n"
901 "fmla v16.4s, v6.4s, v3.s[0] \n"
902 "fmla v17.4s, v6.4s, v3.s[1] \n"
903 "fmla v18.4s, v6.4s, v3.s[2] \n"
904 "fmla v19.4s, v6.4s, v3.s[3] \n"
905
906 "subs %w0, %w0, #1 \n"
907
908 "fmla v20.4s, v7.4s, v1.s[0] \n"
909 "fmla v21.4s, v7.4s, v1.s[1] \n"
910 "fmla v22.4s, v7.4s, v1.s[2] \n"
911 "fmla v23.4s, v7.4s, v1.s[3] \n"
912 "fmla v24.4s, v7.4s, v2.s[0] \n"
913 "fmla v25.4s, v7.4s, v2.s[1] \n"
914 "fmla v26.4s, v7.4s, v2.s[2] \n"
915 "fmla v27.4s, v7.4s, v2.s[3] \n"
916 "fmla v28.4s, v7.4s, v3.s[0] \n"
917 "fmla v29.4s, v7.4s, v3.s[1] \n"
918 "fmla v30.4s, v7.4s, v3.s[2] \n"
919 "fmla v31.4s, v7.4s, v3.s[3] \n"
920
921 "bne 0b \n"
922
923 "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
924 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
925 "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
926 "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
927 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
928 "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
929
930 : "=r"(nn), // %0
931 "=r"(output0_tm), // %1
932 "=r"(output1_tm), // %2
933 "=r"(r0), // %3
934 "=r"(k01) // %4
935 : "0"(nn),
936 "1"(output0_tm),
937 "2"(output1_tm),
938 "3"(r0),
939 "4"(k01)
940 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
941 }
942 for (; i + 7 < tiles; i += 8)
943 {
944 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
945
946 const float* k01 = kernel01_tm.row(r);
947
948 int nn = inch; // inch always > 0
949
950 asm volatile(
951 "eor v16.16b, v16.16b, v16.16b \n"
952 "eor v17.16b, v17.16b, v17.16b \n"
953 "eor v18.16b, v18.16b, v18.16b \n"
954 "eor v19.16b, v19.16b, v19.16b \n"
955 "eor v20.16b, v20.16b, v20.16b \n"
956 "eor v21.16b, v21.16b, v21.16b \n"
957 "eor v22.16b, v22.16b, v22.16b \n"
958 "eor v23.16b, v23.16b, v23.16b \n"
959 "eor v24.16b, v24.16b, v24.16b \n"
960 "eor v25.16b, v25.16b, v25.16b \n"
961 "eor v26.16b, v26.16b, v26.16b \n"
962 "eor v27.16b, v27.16b, v27.16b \n"
963 "eor v28.16b, v28.16b, v28.16b \n"
964 "eor v29.16b, v29.16b, v29.16b \n"
965 "eor v30.16b, v30.16b, v30.16b \n"
966 "eor v31.16b, v31.16b, v31.16b \n"
967
968 "0: \n"
969
970 "prfm pldl1keep, [%3, #512] \n"
971 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
972
973 "prfm pldl1keep, [%4, #512] \n"
974 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
975
976 "prfm pldl1keep, [%3, #512] \n"
977 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7
978
979 "fmla v16.4s, v8.4s, v0.s[0] \n"
980 "fmla v17.4s, v8.4s, v1.s[0] \n"
981 "fmla v18.4s, v8.4s, v2.s[0] \n"
982 "fmla v19.4s, v8.4s, v3.s[0] \n"
983 "fmla v20.4s, v8.4s, v4.s[0] \n"
984 "fmla v21.4s, v8.4s, v5.s[0] \n"
985 "fmla v22.4s, v8.4s, v6.s[0] \n"
986 "fmla v23.4s, v8.4s, v7.s[0] \n"
987
988 "fmla v24.4s, v9.4s, v0.s[0] \n"
989 "fmla v25.4s, v9.4s, v1.s[0] \n"
990 "fmla v26.4s, v9.4s, v2.s[0] \n"
991 "fmla v27.4s, v9.4s, v3.s[0] \n"
992 "fmla v28.4s, v9.4s, v4.s[0] \n"
993 "fmla v29.4s, v9.4s, v5.s[0] \n"
994 "fmla v30.4s, v9.4s, v6.s[0] \n"
995 "fmla v31.4s, v9.4s, v7.s[0] \n"
996
997 "prfm pldl1keep, [%4, #512] \n"
998 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
999
1000 "fmla v16.4s, v10.4s, v0.s[1] \n"
1001 "fmla v17.4s, v10.4s, v1.s[1] \n"
1002 "fmla v18.4s, v10.4s, v2.s[1] \n"
1003 "fmla v19.4s, v10.4s, v3.s[1] \n"
1004 "fmla v20.4s, v10.4s, v4.s[1] \n"
1005 "fmla v21.4s, v10.4s, v5.s[1] \n"
1006 "fmla v22.4s, v10.4s, v6.s[1] \n"
1007 "fmla v23.4s, v10.4s, v7.s[1] \n"
1008
1009 "fmla v24.4s, v11.4s, v0.s[1] \n"
1010 "fmla v25.4s, v11.4s, v1.s[1] \n"
1011 "fmla v26.4s, v11.4s, v2.s[1] \n"
1012 "fmla v27.4s, v11.4s, v3.s[1] \n"
1013 "fmla v28.4s, v11.4s, v4.s[1] \n"
1014 "fmla v29.4s, v11.4s, v5.s[1] \n"
1015 "fmla v30.4s, v11.4s, v6.s[1] \n"
1016 "fmla v31.4s, v11.4s, v7.s[1] \n"
1017
1018 "fmla v16.4s, v12.4s, v0.s[2] \n"
1019 "fmla v17.4s, v12.4s, v1.s[2] \n"
1020 "fmla v18.4s, v12.4s, v2.s[2] \n"
1021 "fmla v19.4s, v12.4s, v3.s[2] \n"
1022 "fmla v20.4s, v12.4s, v4.s[2] \n"
1023 "fmla v21.4s, v12.4s, v5.s[2] \n"
1024 "fmla v22.4s, v12.4s, v6.s[2] \n"
1025 "fmla v23.4s, v12.4s, v7.s[2] \n"
1026
1027 "fmla v24.4s, v13.4s, v0.s[2] \n"
1028 "fmla v25.4s, v13.4s, v1.s[2] \n"
1029 "fmla v26.4s, v13.4s, v2.s[2] \n"
1030 "fmla v27.4s, v13.4s, v3.s[2] \n"
1031 "fmla v28.4s, v13.4s, v4.s[2] \n"
1032 "fmla v29.4s, v13.4s, v5.s[2] \n"
1033 "fmla v30.4s, v13.4s, v6.s[2] \n"
1034 "fmla v31.4s, v13.4s, v7.s[2] \n"
1035
1036 "fmla v16.4s, v14.4s, v0.s[3] \n"
1037 "fmla v17.4s, v14.4s, v1.s[3] \n"
1038 "fmla v18.4s, v14.4s, v2.s[3] \n"
1039 "fmla v19.4s, v14.4s, v3.s[3] \n"
1040 "fmla v20.4s, v14.4s, v4.s[3] \n"
1041 "fmla v21.4s, v14.4s, v5.s[3] \n"
1042 "fmla v22.4s, v14.4s, v6.s[3] \n"
1043 "fmla v23.4s, v14.4s, v7.s[3] \n"
1044
1045 "subs %w0, %w0, #1 \n"
1046
1047 "fmla v24.4s, v15.4s, v0.s[3] \n"
1048 "fmla v25.4s, v15.4s, v1.s[3] \n"
1049 "fmla v26.4s, v15.4s, v2.s[3] \n"
1050 "fmla v27.4s, v15.4s, v3.s[3] \n"
1051 "fmla v28.4s, v15.4s, v4.s[3] \n"
1052 "fmla v29.4s, v15.4s, v5.s[3] \n"
1053 "fmla v30.4s, v15.4s, v6.s[3] \n"
1054 "fmla v31.4s, v15.4s, v7.s[3] \n"
1055
1056 "bne 0b \n"
1057
1058 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1059 "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
1060 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
1061 "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
1062
1063 : "=r"(nn), // %0
1064 "=r"(output0_tm), // %1
1065 "=r"(output1_tm), // %2
1066 "=r"(r0), // %3
1067 "=r"(k01) // %4
1068 : "0"(nn),
1069 "1"(output0_tm),
1070 "2"(output1_tm),
1071 "3"(r0),
1072 "4"(k01)
1073 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
1074 }
1075 for (; i + 3 < tiles; i += 4)
1076 {
1077 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1078
1079 const float* k01 = kernel01_tm.row(r);
1080
1081 int nn = inch; // inch always > 0
1082
1083 asm volatile(
1084 "eor v16.16b, v16.16b, v16.16b \n"
1085 "eor v17.16b, v17.16b, v17.16b \n"
1086 "eor v18.16b, v18.16b, v18.16b \n"
1087 "eor v19.16b, v19.16b, v19.16b \n"
1088 "eor v20.16b, v20.16b, v20.16b \n"
1089 "eor v21.16b, v21.16b, v21.16b \n"
1090 "eor v22.16b, v22.16b, v22.16b \n"
1091 "eor v23.16b, v23.16b, v23.16b \n"
1092
1093 "0: \n"
1094
1095 "prfm pldl1keep, [%3, #512] \n"
1096 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
1097
1098 "prfm pldl1keep, [%4, #512] \n"
1099 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1100
1101 "fmla v16.4s, v8.4s, v0.s[0] \n"
1102 "fmla v17.4s, v8.4s, v1.s[0] \n"
1103 "fmla v18.4s, v8.4s, v2.s[0] \n"
1104 "fmla v19.4s, v8.4s, v3.s[0] \n"
1105
1106 "fmla v20.4s, v9.4s, v0.s[0] \n"
1107 "fmla v21.4s, v9.4s, v1.s[0] \n"
1108 "fmla v22.4s, v9.4s, v2.s[0] \n"
1109 "fmla v23.4s, v9.4s, v3.s[0] \n"
1110
1111 "prfm pldl1keep, [%4, #512] \n"
1112 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1113
1114 "fmla v16.4s, v10.4s, v0.s[1] \n"
1115 "fmla v17.4s, v10.4s, v1.s[1] \n"
1116 "fmla v18.4s, v10.4s, v2.s[1] \n"
1117 "fmla v19.4s, v10.4s, v3.s[1] \n"
1118
1119 "fmla v20.4s, v11.4s, v0.s[1] \n"
1120 "fmla v21.4s, v11.4s, v1.s[1] \n"
1121 "fmla v22.4s, v11.4s, v2.s[1] \n"
1122 "fmla v23.4s, v11.4s, v3.s[1] \n"
1123
1124 "fmla v16.4s, v12.4s, v0.s[2] \n"
1125 "fmla v17.4s, v12.4s, v1.s[2] \n"
1126 "fmla v18.4s, v12.4s, v2.s[2] \n"
1127 "fmla v19.4s, v12.4s, v3.s[2] \n"
1128
1129 "fmla v20.4s, v13.4s, v0.s[2] \n"
1130 "fmla v21.4s, v13.4s, v1.s[2] \n"
1131 "fmla v22.4s, v13.4s, v2.s[2] \n"
1132 "fmla v23.4s, v13.4s, v3.s[2] \n"
1133
1134 "subs %w0, %w0, #1 \n"
1135
1136 "fmla v16.4s, v14.4s, v0.s[3] \n"
1137 "fmla v17.4s, v14.4s, v1.s[3] \n"
1138 "fmla v18.4s, v14.4s, v2.s[3] \n"
1139 "fmla v19.4s, v14.4s, v3.s[3] \n"
1140
1141 "fmla v20.4s, v15.4s, v0.s[3] \n"
1142 "fmla v21.4s, v15.4s, v1.s[3] \n"
1143 "fmla v22.4s, v15.4s, v2.s[3] \n"
1144 "fmla v23.4s, v15.4s, v3.s[3] \n"
1145
1146 "bne 0b \n"
1147
1148 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1149 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
1150
1151 : "=r"(nn), // %0
1152 "=r"(output0_tm), // %1
1153 "=r"(output1_tm), // %2
1154 "=r"(r0), // %3
1155 "=r"(k01) // %4
1156 : "0"(nn),
1157 "1"(output0_tm),
1158 "2"(output1_tm),
1159 "3"(r0),
1160 "4"(k01)
1161 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
1162 }
1163 for (; i + 1 < tiles; i += 2)
1164 {
1165 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
1166
1167 const float* k01 = kernel01_tm.row(r);
1168
1169 int nn = inch; // inch always > 0
1170
1171 asm volatile(
1172 "eor v16.16b, v16.16b, v16.16b \n"
1173 "eor v17.16b, v17.16b, v17.16b \n"
1174 "eor v18.16b, v18.16b, v18.16b \n"
1175 "eor v19.16b, v19.16b, v19.16b \n"
1176
1177 "0: \n"
1178
1179 "prfm pldl1keep, [%3, #256] \n"
1180 "ld1 {v0.4s, v1.4s}, [%3], #32 \n" // r0 r1
1181
1182 "prfm pldl1keep, [%4, #512] \n"
1183 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1184
1185 "fmla v16.4s, v8.4s, v0.s[0] \n"
1186 "fmla v17.4s, v8.4s, v1.s[0] \n"
1187 "fmla v18.4s, v9.4s, v0.s[0] \n"
1188 "fmla v19.4s, v9.4s, v1.s[0] \n"
1189
1190 "prfm pldl1keep, [%4, #512] \n"
1191 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1192
1193 "fmla v16.4s, v10.4s, v0.s[1] \n"
1194 "fmla v17.4s, v10.4s, v1.s[1] \n"
1195 "fmla v18.4s, v11.4s, v0.s[1] \n"
1196 "fmla v19.4s, v11.4s, v1.s[1] \n"
1197
1198 "fmla v16.4s, v12.4s, v0.s[2] \n"
1199 "fmla v17.4s, v12.4s, v1.s[2] \n"
1200 "fmla v18.4s, v13.4s, v0.s[2] \n"
1201 "fmla v19.4s, v13.4s, v1.s[2] \n"
1202
1203 "subs %w0, %w0, #1 \n"
1204
1205 "fmla v16.4s, v14.4s, v0.s[3] \n"
1206 "fmla v17.4s, v14.4s, v1.s[3] \n"
1207 "fmla v18.4s, v15.4s, v0.s[3] \n"
1208 "fmla v19.4s, v15.4s, v1.s[3] \n"
1209
1210 "bne 0b \n"
1211
1212 "st1 {v16.4s, v17.4s}, [%1], #32 \n"
1213 "st1 {v18.4s, v19.4s}, [%2], #32 \n"
1214
1215 : "=r"(nn), // %0
1216 "=r"(output0_tm), // %1
1217 "=r"(output1_tm), // %2
1218 "=r"(r0), // %3
1219 "=r"(k01) // %4
1220 : "0"(nn),
1221 "1"(output0_tm),
1222 "2"(output1_tm),
1223 "3"(r0),
1224 "4"(k01)
1225 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1226 }
1227 for (; i < tiles; i++)
1228 {
1229 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
1230
1231 const float* k01 = kernel01_tm.row(r);
1232
1233 int nn = inch; // inch always > 0
1234
1235 asm volatile(
1236 "eor v16.16b, v16.16b, v16.16b \n"
1237 "eor v17.16b, v17.16b, v17.16b \n"
1238
1239 "0: \n"
1240
1241 "prfm pldl1keep, [%3, #128] \n"
1242 "ld1 {v0.4s}, [%3], #16 \n" // r0
1243
1244 "prfm pldl1keep, [%4, #512] \n"
1245 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
1246
1247 "fmla v16.4s, v8.4s, v0.s[0] \n"
1248 "fmla v17.4s, v9.4s, v0.s[0] \n"
1249
1250 "prfm pldl1keep, [%4, #512] \n"
1251 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
1252
1253 "fmla v16.4s, v10.4s, v0.s[1] \n"
1254 "fmla v17.4s, v11.4s, v0.s[1] \n"
1255
1256 "fmla v16.4s, v12.4s, v0.s[2] \n"
1257 "fmla v17.4s, v13.4s, v0.s[2] \n"
1258
1259 "subs %w0, %w0, #1 \n"
1260
1261 "fmla v16.4s, v14.4s, v0.s[3] \n"
1262 "fmla v17.4s, v15.4s, v0.s[3] \n"
1263
1264 "bne 0b \n"
1265
1266 "st1 {v16.4s}, [%1], #16 \n"
1267 "st1 {v17.4s}, [%2], #16 \n"
1268
1269 : "=r"(nn), // %0
1270 "=r"(output0_tm), // %1
1271 "=r"(output1_tm), // %2
1272 "=r"(r0), // %3
1273 "=r"(k01) // %4
1274 : "0"(nn),
1275 "1"(output0_tm),
1276 "2"(output1_tm),
1277 "3"(r0),
1278 "4"(k01)
1279 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
1280 }
1281 }
1282 }
1283 #endif // __ARM_NEON && __aarch64__
1284
1285 #pragma omp parallel for num_threads(opt.num_threads)
1286 for (int p = remain_outch_start; p < outch; p++)
1287 {
1288 float* output0_tm = top_blob_tm.channel(p);
1289
1290 #if __aarch64__
1291 const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
1292 #else
1293 const Mat kernel0_tm = kernel_tm.channel(p);
1294 #endif
1295
1296 for (int r = 0; r < 64; r++)
1297 {
1298 const Mat bb2 = bottom_blob_tm2.channel(r);
1299
1300 int i = 0;
1301 #if __aarch64__
1302 for (; i + 11 < tiles; i += 12)
1303 {
1304 const float* r0 = bb2.row(i / 12);
1305
1306 const float* k0 = kernel0_tm.row(r);
1307
1308 int nn = inch; // inch always > 0
1309
1310 asm volatile(
1311 "eor v8.16b, v8.16b, v8.16b \n"
1312 "eor v9.16b, v9.16b, v9.16b \n"
1313 "eor v10.16b, v10.16b, v10.16b \n"
1314 "eor v11.16b, v11.16b, v11.16b \n"
1315 "eor v12.16b, v12.16b, v12.16b \n"
1316 "eor v13.16b, v13.16b, v13.16b \n"
1317 "eor v14.16b, v14.16b, v14.16b \n"
1318 "eor v15.16b, v15.16b, v15.16b \n"
1319 "eor v16.16b, v16.16b, v16.16b \n"
1320 "eor v17.16b, v17.16b, v17.16b \n"
1321 "eor v18.16b, v18.16b, v18.16b \n"
1322 "eor v19.16b, v19.16b, v19.16b \n"
1323
1324 "0: \n"
1325
1326 "prfm pldl1keep, [%2, #512] \n"
1327 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1328
1329 "prfm pldl1keep, [%3, #512] \n"
1330 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0
1331
1332 "fmla v8.4s, v4.4s, v0.s[0] \n"
1333 "fmla v9.4s, v4.4s, v0.s[1] \n"
1334 "fmla v10.4s, v4.4s, v0.s[2] \n"
1335 "fmla v11.4s, v4.4s, v0.s[3] \n"
1336 "fmla v12.4s, v4.4s, v1.s[0] \n"
1337 "fmla v13.4s, v4.4s, v1.s[1] \n"
1338 "fmla v14.4s, v4.4s, v1.s[2] \n"
1339 "fmla v15.4s, v4.4s, v1.s[3] \n"
1340 "fmla v16.4s, v4.4s, v2.s[0] \n"
1341 "fmla v17.4s, v4.4s, v2.s[1] \n"
1342 "fmla v18.4s, v4.4s, v2.s[2] \n"
1343 "fmla v19.4s, v4.4s, v2.s[3] \n"
1344
1345 "prfm pldl1keep, [%2, #512] \n"
1346 "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
1347
1348 "fmla v8.4s, v5.4s, v3.s[0] \n"
1349 "fmla v9.4s, v5.4s, v3.s[1] \n"
1350 "fmla v10.4s, v5.4s, v3.s[2] \n"
1351 "fmla v11.4s, v5.4s, v3.s[3] \n"
1352 "fmla v12.4s, v5.4s, v20.s[0] \n"
1353 "fmla v13.4s, v5.4s, v20.s[1] \n"
1354 "fmla v14.4s, v5.4s, v20.s[2] \n"
1355 "fmla v15.4s, v5.4s, v20.s[3] \n"
1356 "fmla v16.4s, v5.4s, v21.s[0] \n"
1357 "fmla v17.4s, v5.4s, v21.s[1] \n"
1358 "fmla v18.4s, v5.4s, v21.s[2] \n"
1359 "fmla v19.4s, v5.4s, v21.s[3] \n"
1360
1361 "prfm pldl1keep, [%2, #512] \n"
1362 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
1363
1364 "fmla v8.4s, v6.4s, v22.s[0] \n"
1365 "fmla v9.4s, v6.4s, v22.s[1] \n"
1366 "fmla v10.4s, v6.4s, v22.s[2] \n"
1367 "fmla v11.4s, v6.4s, v22.s[3] \n"
1368 "fmla v12.4s, v6.4s, v23.s[0] \n"
1369 "fmla v13.4s, v6.4s, v23.s[1] \n"
1370 "fmla v14.4s, v6.4s, v23.s[2] \n"
1371 "fmla v15.4s, v6.4s, v23.s[3] \n"
1372 "fmla v16.4s, v6.4s, v24.s[0] \n"
1373 "fmla v17.4s, v6.4s, v24.s[1] \n"
1374 "fmla v18.4s, v6.4s, v24.s[2] \n"
1375 "fmla v19.4s, v6.4s, v24.s[3] \n"
1376
1377 "subs %w0, %w0, #1 \n"
1378
1379 "fmla v8.4s, v7.4s, v25.s[0] \n"
1380 "fmla v9.4s, v7.4s, v25.s[1] \n"
1381 "fmla v10.4s, v7.4s, v25.s[2] \n"
1382 "fmla v11.4s, v7.4s, v25.s[3] \n"
1383 "fmla v12.4s, v7.4s, v26.s[0] \n"
1384 "fmla v13.4s, v7.4s, v26.s[1] \n"
1385 "fmla v14.4s, v7.4s, v26.s[2] \n"
1386 "fmla v15.4s, v7.4s, v26.s[3] \n"
1387 "fmla v16.4s, v7.4s, v27.s[0] \n"
1388 "fmla v17.4s, v7.4s, v27.s[1] \n"
1389 "fmla v18.4s, v7.4s, v27.s[2] \n"
1390 "fmla v19.4s, v7.4s, v27.s[3] \n"
1391
1392 "bne 0b \n"
1393
1394 "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
1395 "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
1396 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1397
1398 : "=r"(nn), // %0
1399 "=r"(output0_tm), // %1
1400 "=r"(r0), // %2
1401 "=r"(k0) // %3
1402 : "0"(nn),
1403 "1"(output0_tm),
1404 "2"(r0),
1405 "3"(k0)
1406 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
1407 }
1408 #endif
1409 for (; i + 7 < tiles; i += 8)
1410 {
1411 #if __aarch64__
1412 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1413 #else
1414 const float* r0 = bb2.row(i / 8);
1415 #endif
1416
1417 const float* k0 = kernel0_tm.row(r);
1418
1419 int nn = inch; // inch always > 0
1420
1421 #if __aarch64__
1422 asm volatile(
1423 "eor v16.16b, v16.16b, v16.16b \n"
1424 "eor v17.16b, v17.16b, v17.16b \n"
1425 "eor v18.16b, v18.16b, v18.16b \n"
1426 "eor v19.16b, v19.16b, v19.16b \n"
1427 "eor v20.16b, v20.16b, v20.16b \n"
1428 "eor v21.16b, v21.16b, v21.16b \n"
1429 "eor v22.16b, v22.16b, v22.16b \n"
1430 "eor v23.16b, v23.16b, v23.16b \n"
1431
1432 "0: \n"
1433
1434 "prfm pldl1keep, [%2, #512] \n"
1435 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1436
1437 "prfm pldl1keep, [%3, #512] \n"
1438 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1439
1440 "fmla v16.4s, v8.4s, v0.s[0] \n"
1441 "fmla v17.4s, v8.4s, v1.s[0] \n"
1442 "fmla v18.4s, v8.4s, v2.s[0] \n"
1443 "fmla v19.4s, v8.4s, v3.s[0] \n"
1444
1445 "prfm pldl1keep, [%2, #512] \n"
1446 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7
1447
1448 "fmla v20.4s, v8.4s, v4.s[0] \n"
1449 "fmla v21.4s, v8.4s, v5.s[0] \n"
1450 "fmla v22.4s, v8.4s, v6.s[0] \n"
1451 "fmla v23.4s, v8.4s, v7.s[0] \n"
1452
1453 "fmla v16.4s, v9.4s, v0.s[1] \n"
1454 "fmla v17.4s, v9.4s, v1.s[1] \n"
1455 "fmla v18.4s, v9.4s, v2.s[1] \n"
1456 "fmla v19.4s, v9.4s, v3.s[1] \n"
1457 "fmla v20.4s, v9.4s, v4.s[1] \n"
1458 "fmla v21.4s, v9.4s, v5.s[1] \n"
1459 "fmla v22.4s, v9.4s, v6.s[1] \n"
1460 "fmla v23.4s, v9.4s, v7.s[1] \n"
1461
1462 "fmla v16.4s, v10.4s, v0.s[2] \n"
1463 "fmla v17.4s, v10.4s, v1.s[2] \n"
1464 "fmla v18.4s, v10.4s, v2.s[2] \n"
1465 "fmla v19.4s, v10.4s, v3.s[2] \n"
1466 "fmla v20.4s, v10.4s, v4.s[2] \n"
1467 "fmla v21.4s, v10.4s, v5.s[2] \n"
1468 "fmla v22.4s, v10.4s, v6.s[2] \n"
1469 "fmla v23.4s, v10.4s, v7.s[2] \n"
1470
1471 "subs %w0, %w0, #1 \n"
1472
1473 "fmla v16.4s, v11.4s, v0.s[3] \n"
1474 "fmla v17.4s, v11.4s, v1.s[3] \n"
1475 "fmla v18.4s, v11.4s, v2.s[3] \n"
1476 "fmla v19.4s, v11.4s, v3.s[3] \n"
1477 "fmla v20.4s, v11.4s, v4.s[3] \n"
1478 "fmla v21.4s, v11.4s, v5.s[3] \n"
1479 "fmla v22.4s, v11.4s, v6.s[3] \n"
1480 "fmla v23.4s, v11.4s, v7.s[3] \n"
1481
1482 "bne 0b \n"
1483
1484 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1485 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
1486
1487 : "=r"(nn), // %0
1488 "=r"(output0_tm), // %1
1489 "=r"(r0), // %2
1490 "=r"(k0) // %3
1491 : "0"(nn),
1492 "1"(output0_tm),
1493 "2"(r0),
1494 "3"(k0)
1495 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
1496 #else
1497 asm volatile(
1498 "veor q8, q8 \n"
1499 "veor q9, q9 \n"
1500 "veor q10, q10 \n"
1501 "veor q11, q11 \n"
1502 "veor q12, q12 \n"
1503 "veor q13, q13 \n"
1504 "veor q14, q14 \n"
1505 "veor q15, q15 \n"
1506
1507 "0: \n"
1508
1509 "pld [%2, #512] \n"
1510 "vldm %2!, {d0-d7} \n"
1511
1512 "pld [%3, #512] \n"
1513 "vldm %3!, {d8-d15} \n"
1514
1515 "vmla.f32 q8, q4, d0[0] \n"
1516 "vmla.f32 q9, q4, d0[1] \n"
1517 "vmla.f32 q10, q4, d1[0] \n"
1518 "vmla.f32 q11, q4, d1[1] \n"
1519 "vmla.f32 q12, q4, d2[0] \n"
1520 "vmla.f32 q13, q4, d2[1] \n"
1521 "vmla.f32 q14, q4, d3[0] \n"
1522 "vmla.f32 q15, q4, d3[1] \n"
1523
1524 "vmla.f32 q8, q5, d4[0] \n"
1525 "vmla.f32 q9, q5, d4[1] \n"
1526 "vmla.f32 q10, q5, d5[0] \n"
1527 "vmla.f32 q11, q5, d5[1] \n"
1528 "vmla.f32 q12, q5, d6[0] \n"
1529 "vmla.f32 q13, q5, d6[1] \n"
1530 "vmla.f32 q14, q5, d7[0] \n"
1531 "vmla.f32 q15, q5, d7[1] \n"
1532
1533 "pld [%2, #512] \n"
1534 "vldm %2!, {d0-d7} \n"
1535
1536 "vmla.f32 q8, q6, d0[0] \n"
1537 "vmla.f32 q9, q6, d0[1] \n"
1538 "vmla.f32 q10, q6, d1[0] \n"
1539 "vmla.f32 q11, q6, d1[1] \n"
1540 "vmla.f32 q12, q6, d2[0] \n"
1541 "vmla.f32 q13, q6, d2[1] \n"
1542 "vmla.f32 q14, q6, d3[0] \n"
1543 "vmla.f32 q15, q6, d3[1] \n"
1544
1545 "subs %0, %0, #1 \n"
1546
1547 "vmla.f32 q8, q7, d4[0] \n"
1548 "vmla.f32 q9, q7, d4[1] \n"
1549 "vmla.f32 q10, q7, d5[0] \n"
1550 "vmla.f32 q11, q7, d5[1] \n"
1551 "vmla.f32 q12, q7, d6[0] \n"
1552 "vmla.f32 q13, q7, d6[1] \n"
1553 "vmla.f32 q14, q7, d7[0] \n"
1554 "vmla.f32 q15, q7, d7[1] \n"
1555
1556 "bne 0b \n"
1557
1558 "vstm %1!, {d16-d23} \n"
1559 "vstm %1!, {d24-d31} \n"
1560
1561 : "=r"(nn), // %0
1562 "=r"(output0_tm), // %1
1563 "=r"(r0), // %2
1564 "=r"(k0) // %3
1565 : "0"(nn),
1566 "1"(output0_tm),
1567 "2"(r0),
1568 "3"(k0)
1569 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1570 #endif
1571 }
1572 for (; i + 3 < tiles; i += 4)
1573 {
1574 #if __aarch64__
1575 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1576 #else
1577 const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1578 #endif
1579
1580 const float* k0 = kernel0_tm.row(r);
1581
1582 int nn = inch; // inch always > 0
1583
1584 #if __aarch64__
1585 asm volatile(
1586 "eor v16.16b, v16.16b, v16.16b \n"
1587 "eor v17.16b, v17.16b, v17.16b \n"
1588 "eor v18.16b, v18.16b, v18.16b \n"
1589 "eor v19.16b, v19.16b, v19.16b \n"
1590
1591 "0: \n"
1592
1593 "prfm pldl1keep, [%2, #512] \n"
1594 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
1595
1596 "prfm pldl1keep, [%3, #512] \n"
1597 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1598
1599 "fmla v16.4s, v8.4s, v0.s[0] \n"
1600 "fmla v17.4s, v8.4s, v1.s[0] \n"
1601 "fmla v18.4s, v8.4s, v2.s[0] \n"
1602 "fmla v19.4s, v8.4s, v3.s[0] \n"
1603
1604 "fmla v16.4s, v9.4s, v0.s[1] \n"
1605 "fmla v17.4s, v9.4s, v1.s[1] \n"
1606 "fmla v18.4s, v9.4s, v2.s[1] \n"
1607 "fmla v19.4s, v9.4s, v3.s[1] \n"
1608
1609 "fmla v16.4s, v10.4s, v0.s[2] \n"
1610 "fmla v17.4s, v10.4s, v1.s[2] \n"
1611 "fmla v18.4s, v10.4s, v2.s[2] \n"
1612 "fmla v19.4s, v10.4s, v3.s[2] \n"
1613
1614 "subs %w0, %w0, #1 \n"
1615
1616 "fmla v16.4s, v11.4s, v0.s[3] \n"
1617 "fmla v17.4s, v11.4s, v1.s[3] \n"
1618 "fmla v18.4s, v11.4s, v2.s[3] \n"
1619 "fmla v19.4s, v11.4s, v3.s[3] \n"
1620
1621 "bne 0b \n"
1622
1623 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
1624
1625 : "=r"(nn), // %0
1626 "=r"(output0_tm), // %1
1627 "=r"(r0), // %2
1628 "=r"(k0) // %3
1629 : "0"(nn),
1630 "1"(output0_tm),
1631 "2"(r0),
1632 "3"(k0)
1633 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
1634 #else
1635 asm volatile(
1636 "veor q8, q8 \n"
1637 "veor q9, q9 \n"
1638 "veor q10, q10 \n"
1639 "veor q11, q11 \n"
1640
1641 "0: \n"
1642
1643 "pld [%2, #512] \n"
1644 "vldm %2!, {d0-d7} \n"
1645
1646 "pld [%3, #512] \n"
1647 "vldm %3!, {d8-d15} \n"
1648
1649 "vmla.f32 q8, q4, d0[0] \n"
1650 "vmla.f32 q9, q4, d2[0] \n"
1651 "vmla.f32 q10, q4, d4[0] \n"
1652 "vmla.f32 q11, q4, d6[0] \n"
1653
1654 "vmla.f32 q8, q5, d0[1] \n"
1655 "vmla.f32 q9, q5, d2[1] \n"
1656 "vmla.f32 q10, q5, d4[1] \n"
1657 "vmla.f32 q11, q5, d6[1] \n"
1658
1659 "vmla.f32 q8, q6, d1[0] \n"
1660 "vmla.f32 q9, q6, d3[0] \n"
1661 "vmla.f32 q10, q6, d5[0] \n"
1662 "vmla.f32 q11, q6, d7[0] \n"
1663
1664 "subs %0, %0, #1 \n"
1665
1666 "vmla.f32 q8, q7, d1[1] \n"
1667 "vmla.f32 q9, q7, d3[1] \n"
1668 "vmla.f32 q10, q7, d5[1] \n"
1669 "vmla.f32 q11, q7, d7[1] \n"
1670
1671 "bne 0b \n"
1672
1673 "vstm %1!, {d16-d23} \n"
1674
1675 : "=r"(nn), // %0
1676 "=r"(output0_tm), // %1
1677 "=r"(r0), // %2
1678 "=r"(k0) // %3
1679 : "0"(nn),
1680 "1"(output0_tm),
1681 "2"(r0),
1682 "3"(k0)
1683 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1684 #endif
1685 }
1686 for (; i + 1 < tiles; i += 2)
1687 {
1688 #if __aarch64__
1689 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
1690 #else
1691 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
1692 #endif
1693
1694 const float* k0 = kernel0_tm.row(r);
1695
1696 int nn = inch; // inch always > 0
1697
1698 #if __aarch64__
1699 asm volatile(
1700 "eor v16.16b, v16.16b, v16.16b \n"
1701 "eor v17.16b, v17.16b, v17.16b \n"
1702
1703 "0: \n"
1704
1705 "prfm pldl1keep, [%2, #256] \n"
1706 "ld1 {v0.4s, v1.4s}, [%2], #32 \n" // r0 r1
1707
1708 "prfm pldl1keep, [%3, #512] \n"
1709 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1710
1711 "fmla v16.4s, v8.4s, v0.s[0] \n"
1712 "fmla v17.4s, v8.4s, v1.s[0] \n"
1713
1714 "fmla v16.4s, v9.4s, v0.s[1] \n"
1715 "fmla v17.4s, v9.4s, v1.s[1] \n"
1716
1717 "fmla v16.4s, v10.4s, v0.s[2] \n"
1718 "fmla v17.4s, v10.4s, v1.s[2] \n"
1719
1720 "subs %w0, %w0, #1 \n"
1721
1722 "fmla v16.4s, v11.4s, v0.s[3] \n"
1723 "fmla v17.4s, v11.4s, v1.s[3] \n"
1724
1725 "bne 0b \n"
1726
1727 "st1 {v16.4s, v17.4s}, [%1], #32 \n"
1728
1729 : "=r"(nn), // %0
1730 "=r"(output0_tm), // %1
1731 "=r"(r0), // %2
1732 "=r"(k0) // %3
1733 : "0"(nn),
1734 "1"(output0_tm),
1735 "2"(r0),
1736 "3"(k0)
1737 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
1738 #else
1739 asm volatile(
1740 "veor q8, q8 \n"
1741 "veor q9, q9 \n"
1742
1743 "0: \n"
1744
1745 "pld [%2, #256] \n"
1746 "vld1.f32 {d0-d3}, [%2 :128]! \n"
1747
1748 "pld [%3, #512] \n"
1749 "vldm %3!, {d8-d15} \n"
1750
1751 "vmla.f32 q8, q4, d0[0] \n"
1752 "vmla.f32 q9, q4, d2[0] \n"
1753
1754 "vmla.f32 q8, q5, d0[1] \n"
1755 "vmla.f32 q9, q5, d2[1] \n"
1756
1757 "vmla.f32 q8, q6, d1[0] \n"
1758 "vmla.f32 q9, q6, d3[0] \n"
1759
1760 "subs %0, %0, #1 \n"
1761
1762 "vmla.f32 q8, q7, d1[1] \n"
1763 "vmla.f32 q9, q7, d3[1] \n"
1764
1765 "bne 0b \n"
1766
1767 "vst1.f32 {d16-d19}, [%1 :128]! \n"
1768
1769 : "=r"(nn), // %0
1770 "=r"(output0_tm), // %1
1771 "=r"(r0), // %2
1772 "=r"(k0) // %3
1773 : "0"(nn),
1774 "1"(output0_tm),
1775 "2"(r0),
1776 "3"(k0)
1777 : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
1778 #endif
1779 }
1780 for (; i < tiles; i++)
1781 {
1782 #if __aarch64__
1783 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
1784 #else
1785 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
1786 #endif
1787
1788 const float* k0 = kernel0_tm.row(r);
1789
1790 int nn = inch; // inch always > 0
1791
1792 #if __aarch64__
1793 asm volatile(
1794 "eor v16.16b, v16.16b, v16.16b \n"
1795
1796 "0: \n"
1797
1798 "prfm pldl1keep, [%2, #128] \n"
1799 "ld1 {v0.4s}, [%2], #16 \n" // r0
1800
1801 "prfm pldl1keep, [%3, #512] \n"
1802 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
1803
1804 "fmla v16.4s, v8.4s, v0.s[0] \n"
1805 "fmla v16.4s, v9.4s, v0.s[1] \n"
1806
1807 "subs %w0, %w0, #1 \n"
1808
1809 "fmla v16.4s, v10.4s, v0.s[2] \n"
1810 "fmla v16.4s, v11.4s, v0.s[3] \n"
1811
1812 "bne 0b \n"
1813
1814 "st1 {v16.4s}, [%1], #16 \n"
1815
1816 : "=r"(nn), // %0
1817 "=r"(output0_tm), // %1
1818 "=r"(r0), // %2
1819 "=r"(k0) // %3
1820 : "0"(nn),
1821 "1"(output0_tm),
1822 "2"(r0),
1823 "3"(k0)
1824 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
1825 #else
1826 asm volatile(
1827 "veor q8, q8 \n"
1828
1829 "0: \n"
1830
1831 "pld [%2, #128] \n"
1832 "vld1.f32 {d0-d1}, [%2 :128]! \n"
1833
1834 "pld [%3, #512] \n"
1835 "vldm %3!, {d8-d15} \n"
1836
1837 "vmla.f32 q8, q4, d0[0] \n"
1838 "vmla.f32 q8, q5, d0[1] \n"
1839
1840 "subs %0, %0, #1 \n"
1841
1842 "vmla.f32 q8, q6, d1[0] \n"
1843 "vmla.f32 q8, q7, d1[1] \n"
1844
1845 "bne 0b \n"
1846
1847 "vst1.f32 {d16-d17}, [%1 :128]! \n"
1848
1849 : "=r"(nn), // %0
1850 "=r"(output0_tm), // %1
1851 "=r"(r0), // %2
1852 "=r"(k0) // %3
1853 : "0"(nn),
1854 "1"(output0_tm),
1855 "2"(r0),
1856 "3"(k0)
1857 : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
1858 #endif
1859 }
1860 }
1861 }
1862 }
1863 bottom_blob_tm = Mat();
1864 // END dot
1865
1866 // BEGIN transform output
1867 Mat top_blob_bordered;
1868 if (outw == top_blob.w && outh == top_blob.h)
1869 {
1870 top_blob_bordered = top_blob;
1871 }
1872 else
1873 {
1874 top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
1875 }
1876 {
1877 // const float otm[6][8] = {
1878 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
1879 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
1880 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
1881 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
1882 // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
1883 // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
1884 // };
1885
1886 // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
1887 // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
1888 // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
1889 // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
1890 // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
1891 // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
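// note (added): like the input transform, this 6x8 output transform is separable: it is
// applied along one dimension first (the 8-iteration m loop, intermediate in tmp[6][8][4])
// and then along the other (the 6-iteration m loop), where the per-channel _bias0 loaded
// below is folded into the final 6x6 result.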
1892
1893 int w_tm = outw / 6 * 8;
1894 int h_tm = outh / 6 * 8;
1895 const int tiles = w_tm / 8 * h_tm / 8;
1896
1897 #pragma omp parallel for num_threads(opt.num_threads)
1898 for (int p = 0; p < outch; p++)
1899 {
1900 const Mat out0_tm = top_blob_tm.channel(p);
1901 Mat out0 = top_blob_bordered.channel(p);
1902
1903 // const float bias0 = bias ? bias[p] : 0.f;
1904 float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
1905
1906 float tmp[6][8][4];
1907
1908 // tile
1909 for (int i = 0; i < outh / 6; i++)
1910 {
1911 for (int j = 0; j < outw / 6; j++)
1912 {
1913 // top_blob_tm.create(tiles, 64, outch, elemsize, elempack);
1914
1915 const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 8 + j) * 4;
1916 const float* output0_tm_1 = output0_tm_0 + tiles * 4;
1917 const float* output0_tm_2 = output0_tm_0 + tiles * 8;
1918 const float* output0_tm_3 = output0_tm_0 + tiles * 12;
1919 const float* output0_tm_4 = output0_tm_0 + tiles * 16;
1920 const float* output0_tm_5 = output0_tm_0 + tiles * 20;
1921 const float* output0_tm_6 = output0_tm_0 + tiles * 24;
1922 const float* output0_tm_7 = output0_tm_0 + tiles * 28;
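                    // Layout note: top_blob_tm keeps the 64 transformed positions of a tile
                    // as 64 consecutive rows of `tiles` pack4 values, so adjacent positions
                    // are tiles * 4 floats apart. Each m iteration below reads one row of the
                    // 8x8 tile through these eight pointers and then advances them by eight
                    // positions (tiles * 32 floats).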
1923
1924 float* output0 = out0.row(i * 6) + (j * 6) * 4;
1925
1926 // TODO neon optimize
1927 for (int m = 0; m < 8; m++)
1928 {
1929 float32x4_t _out0tm0 = vld1q_f32(output0_tm_0);
1930 float32x4_t _out0tm1 = vld1q_f32(output0_tm_1);
1931 float32x4_t _out0tm2 = vld1q_f32(output0_tm_2);
1932 float32x4_t _out0tm3 = vld1q_f32(output0_tm_3);
1933 float32x4_t _out0tm4 = vld1q_f32(output0_tm_4);
1934 float32x4_t _out0tm5 = vld1q_f32(output0_tm_5);
1935 float32x4_t _out0tm6 = vld1q_f32(output0_tm_6);
1936 float32x4_t _out0tm7 = vld1q_f32(output0_tm_7);
1937
1938 float32x4_t _tmp024a = vaddq_f32(_out0tm1, _out0tm2);
1939 float32x4_t _tmp135a = vsubq_f32(_out0tm1, _out0tm2);
1940
1941 // float tmp024a = output0_tm[1] + output0_tm[2];
1942 // float tmp135a = output0_tm[1] - output0_tm[2];
1943
1944 float32x4_t _tmp024b = vaddq_f32(_out0tm3, _out0tm4);
1945 float32x4_t _tmp135b = vsubq_f32(_out0tm3, _out0tm4);
1946
1947 // float tmp024b = output0_tm[3] + output0_tm[4];
1948 // float tmp135b = output0_tm[3] - output0_tm[4];
1949
1950 float32x4_t _tmp024c = vaddq_f32(_out0tm5, _out0tm6);
1951 float32x4_t _tmp135c = vsubq_f32(_out0tm5, _out0tm6);
1952
1953 // float tmp024c = output0_tm[5] + output0_tm[6];
1954 // float tmp135c = output0_tm[5] - output0_tm[6];
1955
1956 float32x4_t _tmp0m = vaddq_f32(vaddq_f32(_out0tm0, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f));
1957 float32x4_t _tmp2m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f);
1958 float32x4_t _tmp4m = vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f);
1959 vst1q_f32(tmp[0][m], _tmp0m);
1960 vst1q_f32(tmp[2][m], _tmp2m);
1961 vst1q_f32(tmp[4][m], _tmp4m);
1962
1963 // tmp[0][m] = output0_tm[0] + tmp024a + tmp024b + tmp024c * 32;
1964 // tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
1965                     // tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c * 2;
1966
1967 float32x4_t _tmp1m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f);
1968 float32x4_t _tmp3m = vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f);
1969 float32x4_t _tmp5m = vaddq_f32(vaddq_f32(_out0tm7, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f));
1970 vst1q_f32(tmp[1][m], _tmp1m);
1971 vst1q_f32(tmp[3][m], _tmp3m);
1972 vst1q_f32(tmp[5][m], _tmp5m);
1973
1974                     // tmp[1][m] = tmp135a + tmp135b * 2 + tmp135c * 16;
1975 // tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
1976 // tmp[5][m] = output0_tm[7] + tmp135a + tmp135b * 32 + tmp135c;
1977
1978 output0_tm_0 += tiles * 32;
1979 output0_tm_1 += tiles * 32;
1980 output0_tm_2 += tiles * 32;
1981 output0_tm_3 += tiles * 32;
1982 output0_tm_4 += tiles * 32;
1983 output0_tm_5 += tiles * 32;
1984 output0_tm_6 += tiles * 32;
1985 output0_tm_7 += tiles * 32;
1986 }
1987
1988 for (int m = 0; m < 6; m++)
1989 {
1990 float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
1991 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
1992 float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
1993 float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
1994 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
1995 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
1996 float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
1997 float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
1998
1999 float32x4_t _tmp024a = vaddq_f32(_tmp01, _tmp02);
2000 float32x4_t _tmp135a = vsubq_f32(_tmp01, _tmp02);
2001
2002 // float tmp024a = tmp0[1] + tmp0[2];
2003 // float tmp135a = tmp0[1] - tmp0[2];
2004
2005 float32x4_t _tmp024b = vaddq_f32(_tmp03, _tmp04);
2006 float32x4_t _tmp135b = vsubq_f32(_tmp03, _tmp04);
2007
2008 // float tmp024b = tmp0[3] + tmp0[4];
2009 // float tmp135b = tmp0[3] - tmp0[4];
2010
2011 float32x4_t _tmp024c = vaddq_f32(_tmp05, _tmp06);
2012 float32x4_t _tmp135c = vsubq_f32(_tmp05, _tmp06);
2013
2014 // float tmp024c = tmp0[5] + tmp0[6];
2015 // float tmp135c = tmp0[5] - tmp0[6];
2016
2017 float32x4_t _out00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp00, _tmp024a), vmlaq_n_f32(_tmp024b, _tmp024c, 32.f)));
2018 float32x4_t _out02 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 4.f), _tmp024c, 8.f));
2019 float32x4_t _out04 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp024a, _tmp024b, 16.f), _tmp024c, 2.f));
2020 vst1q_f32(output0, _out00);
2021 vst1q_f32(output0 + 8, _out02);
2022 vst1q_f32(output0 + 16, _out04);
2023
2024 // output0[0] = bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32;
2025 // output0[2] = bias0 + tmp024a + tmp024b * 4 + tmp024c * 8;
2026                     // output0[4] = bias0 + tmp024a + tmp024b * 16 + tmp024c * 2;
2027
2028 float32x4_t _out01 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 2.f), _tmp135c, 16.f));
2029 float32x4_t _out03 = vaddq_f32(_bias0, vmlaq_n_f32(vmlaq_n_f32(_tmp135a, _tmp135b, 8.f), _tmp135c, 4.f));
2030 float32x4_t _out05 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp07, _tmp135a), vmlaq_n_f32(_tmp135c, _tmp135b, 32.f)));
2031 vst1q_f32(output0 + 4, _out01);
2032 vst1q_f32(output0 + 12, _out03);
2033 vst1q_f32(output0 + 20, _out05);
2034
2035                     // output0[1] = bias0 + tmp135a + tmp135b * 2 + tmp135c * 16;
2036 // output0[3] = bias0 + tmp135a + tmp135b * 8 + tmp135c * 4;
2037 // output0[5] = bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c;
2038
2039 output0 += outw * 4;
2040 }
2041 }
2042 }
2043 }
2044 }
2045 // END transform output
2046
2047 // cut result pad
2048 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
2049 }
2050
2051 static void conv3x3s1_winograd42_transform_kernel_pack4_neon(const Mat& kernel, Mat& kernel_tm_pack4, int inch, int outch)
2052 {
2053 // winograd43 transform kernel
2054 Mat kernel_tm(6 * 6, inch, outch);
2055
2056 const float ktm[6][3] = {
2057 {1.0f / 4, 0.0f, 0.0f},
2058 {-1.0f / 6, -1.0f / 6, -1.0f / 6},
2059 {-1.0f / 6, 1.0f / 6, -1.0f / 6},
2060 {1.0f / 24, 1.0f / 12, 1.0f / 6},
2061 {1.0f / 24, -1.0f / 12, 1.0f / 6},
2062 {0.0f, 0.0f, 1.0f}
2063 };
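    // ktm is the Winograd G matrix for F(4x4, 3x3): the two passes below apply it
    // along both axes of each 3x3 kernel g, yielding the 6x6 transformed kernel
    // U = G * g * G^T (up to the row/column storage order used here). Together with
    // the input transform B^T d B and the output transform A^T [.] A elsewhere in
    // this file, the overall identity is Y = A^T [ (G g G^T) . (B^T d B) ] A, where
    // "." is the element-wise product evaluated by the dot kernels.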
2064
2065 #pragma omp parallel for
2066 for (int p = 0; p < outch; p++)
2067 {
2068 for (int q = 0; q < inch; q++)
2069 {
2070 const float* kernel0 = (const float*)kernel + p * inch * 9 + q * 9;
2071 float* kernel_tm0 = kernel_tm.channel(p).row(q);
2072
2073 // transform kernel
2074 const float* k0 = kernel0;
2075 const float* k1 = kernel0 + 3;
2076 const float* k2 = kernel0 + 6;
2077
2078 // h
2079 float tmp[6][3];
2080 for (int i = 0; i < 6; i++)
2081 {
2082 tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
2083 tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
2084 tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
2085 }
2086
2087 // U
2088 for (int j = 0; j < 6; j++)
2089 {
2090 float* tmpp = &tmp[j][0];
2091
2092 for (int i = 0; i < 6; i++)
2093 {
2094 kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
2095 }
2096 }
2097 }
2098 }
2099
2100 // interleave
2101 // src = 36-inch-outch
2102 // dst = 4b-4a-inch/4a-36-outch/4b;
2103 #if __aarch64__
2104 kernel_tm_pack4.create(2 * inch / 4, 36, (outch / 4) / 2 + (outch / 4) % 2, (size_t)4u * 16, 16);
2105 #else
2106 kernel_tm_pack4.create(inch / 4, 36, outch / 4, (size_t)4u * 16, 16);
2107 #endif
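    // On __aarch64__ eight raw output channels (two pack4 groups) are packed per
    // kernel_tm_pack4 channel so the paired-output micro-kernel can stream 32 weight
    // floats per packed input-channel group; on armv7 it is four output channels and
    // 16 weights. Within each block the output channel varies fastest, then the four
    // input-channel lanes.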
2108
2109 int q = 0;
2110 #if __aarch64__
2111 for (; q + 7 < outch; q += 8)
2112 {
2113 const Mat k0 = kernel_tm.channel(q);
2114 const Mat k1 = kernel_tm.channel(q + 1);
2115 const Mat k2 = kernel_tm.channel(q + 2);
2116 const Mat k3 = kernel_tm.channel(q + 3);
2117 const Mat k4 = kernel_tm.channel(q + 4);
2118 const Mat k5 = kernel_tm.channel(q + 5);
2119 const Mat k6 = kernel_tm.channel(q + 6);
2120 const Mat k7 = kernel_tm.channel(q + 7);
2121
2122 Mat g0 = kernel_tm_pack4.channel(q / 8);
2123
2124 for (int k = 0; k < 36; k++)
2125 {
2126 float* g00 = g0.row(k);
2127
2128 for (int p = 0; p + 3 < inch; p += 4)
2129 {
2130 const float* k00 = k0.row(p);
2131 const float* k01 = k0.row(p + 1);
2132 const float* k02 = k0.row(p + 2);
2133 const float* k03 = k0.row(p + 3);
2134
2135 const float* k10 = k1.row(p);
2136 const float* k11 = k1.row(p + 1);
2137 const float* k12 = k1.row(p + 2);
2138 const float* k13 = k1.row(p + 3);
2139
2140 const float* k20 = k2.row(p);
2141 const float* k21 = k2.row(p + 1);
2142 const float* k22 = k2.row(p + 2);
2143 const float* k23 = k2.row(p + 3);
2144
2145 const float* k30 = k3.row(p);
2146 const float* k31 = k3.row(p + 1);
2147 const float* k32 = k3.row(p + 2);
2148 const float* k33 = k3.row(p + 3);
2149
2150 const float* k40 = k4.row(p);
2151 const float* k41 = k4.row(p + 1);
2152 const float* k42 = k4.row(p + 2);
2153 const float* k43 = k4.row(p + 3);
2154
2155 const float* k50 = k5.row(p);
2156 const float* k51 = k5.row(p + 1);
2157 const float* k52 = k5.row(p + 2);
2158 const float* k53 = k5.row(p + 3);
2159
2160 const float* k60 = k6.row(p);
2161 const float* k61 = k6.row(p + 1);
2162 const float* k62 = k6.row(p + 2);
2163 const float* k63 = k6.row(p + 3);
2164
2165 const float* k70 = k7.row(p);
2166 const float* k71 = k7.row(p + 1);
2167 const float* k72 = k7.row(p + 2);
2168 const float* k73 = k7.row(p + 3);
2169
2170 g00[0] = k00[k];
2171 g00[1] = k10[k];
2172 g00[2] = k20[k];
2173 g00[3] = k30[k];
2174
2175 g00[4] = k40[k];
2176 g00[5] = k50[k];
2177 g00[6] = k60[k];
2178 g00[7] = k70[k];
2179
2180 g00[8] = k01[k];
2181 g00[9] = k11[k];
2182 g00[10] = k21[k];
2183 g00[11] = k31[k];
2184
2185 g00[12] = k41[k];
2186 g00[13] = k51[k];
2187 g00[14] = k61[k];
2188 g00[15] = k71[k];
2189
2190 g00[16] = k02[k];
2191 g00[17] = k12[k];
2192 g00[18] = k22[k];
2193 g00[19] = k32[k];
2194
2195 g00[20] = k42[k];
2196 g00[21] = k52[k];
2197 g00[22] = k62[k];
2198 g00[23] = k72[k];
2199
2200 g00[24] = k03[k];
2201 g00[25] = k13[k];
2202 g00[26] = k23[k];
2203 g00[27] = k33[k];
2204
2205 g00[28] = k43[k];
2206 g00[29] = k53[k];
2207 g00[30] = k63[k];
2208 g00[31] = k73[k];
2209
2210 g00 += 32;
2211 }
2212 }
2213 }
2214 #endif // __aarch64__
2215 for (; q + 3 < outch; q += 4)
2216 {
2217 const Mat k0 = kernel_tm.channel(q);
2218 const Mat k1 = kernel_tm.channel(q + 1);
2219 const Mat k2 = kernel_tm.channel(q + 2);
2220 const Mat k3 = kernel_tm.channel(q + 3);
2221
2222 #if __aarch64__
2223 Mat g0 = kernel_tm_pack4.channel(q / 8 + (q % 8) / 4);
2224 #else
2225 Mat g0 = kernel_tm_pack4.channel(q / 4);
2226 #endif
2227
2228 for (int k = 0; k < 36; k++)
2229 {
2230 float* g00 = g0.row(k);
2231
2232 for (int p = 0; p + 3 < inch; p += 4)
2233 {
2234 const float* k00 = k0.row(p);
2235 const float* k01 = k0.row(p + 1);
2236 const float* k02 = k0.row(p + 2);
2237 const float* k03 = k0.row(p + 3);
2238
2239 const float* k10 = k1.row(p);
2240 const float* k11 = k1.row(p + 1);
2241 const float* k12 = k1.row(p + 2);
2242 const float* k13 = k1.row(p + 3);
2243
2244 const float* k20 = k2.row(p);
2245 const float* k21 = k2.row(p + 1);
2246 const float* k22 = k2.row(p + 2);
2247 const float* k23 = k2.row(p + 3);
2248
2249 const float* k30 = k3.row(p);
2250 const float* k31 = k3.row(p + 1);
2251 const float* k32 = k3.row(p + 2);
2252 const float* k33 = k3.row(p + 3);
2253
2254 g00[0] = k00[k];
2255 g00[1] = k10[k];
2256 g00[2] = k20[k];
2257 g00[3] = k30[k];
2258
2259 g00[4] = k01[k];
2260 g00[5] = k11[k];
2261 g00[6] = k21[k];
2262 g00[7] = k31[k];
2263
2264 g00[8] = k02[k];
2265 g00[9] = k12[k];
2266 g00[10] = k22[k];
2267 g00[11] = k32[k];
2268
2269 g00[12] = k03[k];
2270 g00[13] = k13[k];
2271 g00[14] = k23[k];
2272 g00[15] = k33[k];
2273
2274 g00 += 16;
2275 }
2276 }
2277 }
2278 }
2279
2280 static void conv3x3s1_winograd42_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
2281 {
2282 int w = bottom_blob.w;
2283 int h = bottom_blob.h;
2284 int inch = bottom_blob.c;
2285 size_t elemsize = bottom_blob.elemsize;
2286 int elempack = bottom_blob.elempack;
2287
2288 int outw = top_blob.w;
2289 int outh = top_blob.h;
2290 int outch = top_blob.c;
2291
2292 // pad to 4n+2
2293 Mat bottom_blob_bordered = bottom_blob;
2294
2295 outw = (outw + 3) / 4 * 4;
2296 outh = (outh + 3) / 4 * 4;
2297
2298 w = outw + 2;
2299 h = outh + 2;
2300 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
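    // Example of the padding arithmetic above: for a 14x14 top_blob, outw/outh are
    // rounded up to 16, the bordered input becomes 18x18, and the input transform
    // below splits it into 4x4 = 16 overlapping 6x6 tiles with stride 4.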
2301
2302 const float* bias = _bias;
2303
2304 // BEGIN transform input
2305 Mat bottom_blob_tm;
2306 {
2307 int w_tm = outw / 4 * 6;
2308 int h_tm = outh / 4 * 6;
2309
2310 const int tiles = w_tm / 6 * h_tm / 6;
2311
2312 bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
2313
2314         // const float itm[6][6] = {
2315 // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f},
2316 // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f},
2317 // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f},
2318 // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f},
2319 // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f},
2320 // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f}
2321 // };
2322
2323 // 0 = 4 * r00 - 5 * r02 + r04
2324 // 1 = -4 * (r01 + r02) + r04 + r03
2325 // 2 = 4 * (r01 - r02) + r04 - r03
2326 // 3 = -2 * (r01 - r03) + r04 - r02
2327 // 4 = 2 * (r01 - r03) + r04 - r02
2328 // 5 = 4 * r01 - 5 * r03 + r05
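        // The six formulas above are one 1-D pass of itm (B^T for F(4x4, 3x3)),
        // applied along rows and then columns of each 6x6 input tile. A scalar sketch
        // of one pass, for reference only (the helper name is illustrative):
        //
        // void input_transform_1d_f43(const float r[6], float o[6])
        // {
        //     o[0] = 4 * r[0] - 5 * r[2] + r[4];
        //     o[1] = -4 * (r[1] + r[2]) + r[3] + r[4];
        //     o[2] = 4 * (r[1] - r[2]) - r[3] + r[4];
        //     o[3] = -2 * (r[1] - r[3]) - r[2] + r[4];
        //     o[4] = 2 * (r[1] - r[3]) - r[2] + r[4];
        //     o[5] = 4 * r[1] - 5 * r[3] + r[5];
        // }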
2329
2330 #pragma omp parallel for num_threads(opt.num_threads)
2331 for (int q = 0; q < inch; q++)
2332 {
2333 const Mat img0 = bottom_blob_bordered.channel(q);
2334 Mat img0_tm = bottom_blob_tm.channel(q);
2335
2336 float tmp[6][6][4];
2337
2338 // tile
2339 for (int i = 0; i < h_tm / 6; i++)
2340 {
2341 for (int j = 0; j < w_tm / 6; j++)
2342 {
2343 const float* r0 = img0.row(i * 4) + (j * 4) * 4;
2344
2345 for (int m = 0; m < 6; m++)
2346 {
2347 float32x4_t _r00 = vld1q_f32(r0);
2348 float32x4_t _r01 = vld1q_f32(r0 + 4);
2349 float32x4_t _r02 = vld1q_f32(r0 + 8);
2350 float32x4_t _r03 = vld1q_f32(r0 + 12);
2351 float32x4_t _r04 = vld1q_f32(r0 + 16);
2352 float32x4_t _r05 = vld1q_f32(r0 + 20);
2353
2354 float32x4_t _tmp0m = vmlsq_n_f32(vmlaq_n_f32(_r04, _r00, 4.f), _r02, 5.f);
2355 float32x4_t _tmp1m = vmlsq_n_f32(vaddq_f32(_r04, _r03), vaddq_f32(_r01, _r02), 4.f);
2356 float32x4_t _tmp2m = vmlaq_n_f32(vsubq_f32(_r04, _r03), vsubq_f32(_r01, _r02), 4.f);
2357 float32x4_t _tmp3m = vmlsq_n_f32(vsubq_f32(_r04, _r02), vsubq_f32(_r01, _r03), 2.f);
2358 float32x4_t _tmp4m = vmlaq_n_f32(vsubq_f32(_r04, _r02), vsubq_f32(_r01, _r03), 2.f);
2359 float32x4_t _tmp5m = vmlsq_n_f32(vmlaq_n_f32(_r05, _r01, 4.f), _r03, 5.f);
2360
2361 vst1q_f32(tmp[0][m], _tmp0m);
2362 vst1q_f32(tmp[1][m], _tmp1m);
2363 vst1q_f32(tmp[2][m], _tmp2m);
2364 vst1q_f32(tmp[3][m], _tmp3m);
2365 vst1q_f32(tmp[4][m], _tmp4m);
2366 vst1q_f32(tmp[5][m], _tmp5m);
2367
2368 r0 += w * 4;
2369 }
2370
2371 float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 6 + j) * 4;
2372 float* r0_tm_1 = r0_tm_0 + tiles * 4;
2373 float* r0_tm_2 = r0_tm_0 + tiles * 8;
2374 float* r0_tm_3 = r0_tm_0 + tiles * 12;
2375 float* r0_tm_4 = r0_tm_0 + tiles * 16;
2376 float* r0_tm_5 = r0_tm_0 + tiles * 20;
2377
2378 for (int m = 0; m < 6; m++)
2379 {
2380 float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
2381 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
2382 float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
2383 float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
2384 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
2385 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
2386
2387 float32x4_t _r0tm0 = vmlsq_n_f32(vmlaq_n_f32(_tmp04, _tmp00, 4.f), _tmp02, 5.f);
2388 float32x4_t _r0tm1 = vmlsq_n_f32(vaddq_f32(_tmp04, _tmp03), vaddq_f32(_tmp01, _tmp02), 4.f);
2389 float32x4_t _r0tm2 = vmlaq_n_f32(vsubq_f32(_tmp04, _tmp03), vsubq_f32(_tmp01, _tmp02), 4.f);
2390 float32x4_t _r0tm3 = vmlsq_n_f32(vsubq_f32(_tmp04, _tmp02), vsubq_f32(_tmp01, _tmp03), 2.f);
2391 float32x4_t _r0tm4 = vmlaq_n_f32(vsubq_f32(_tmp04, _tmp02), vsubq_f32(_tmp01, _tmp03), 2.f);
2392 float32x4_t _r0tm5 = vmlsq_n_f32(vmlaq_n_f32(_tmp05, _tmp01, 4.f), _tmp03, 5.f);
2393
2394 vst1q_f32(r0_tm_0, _r0tm0);
2395 vst1q_f32(r0_tm_1, _r0tm1);
2396 vst1q_f32(r0_tm_2, _r0tm2);
2397 vst1q_f32(r0_tm_3, _r0tm3);
2398 vst1q_f32(r0_tm_4, _r0tm4);
2399 vst1q_f32(r0_tm_5, _r0tm5);
2400
2401 r0_tm_0 += tiles * 24;
2402 r0_tm_1 += tiles * 24;
2403 r0_tm_2 += tiles * 24;
2404 r0_tm_3 += tiles * 24;
2405 r0_tm_4 += tiles * 24;
2406 r0_tm_5 += tiles * 24;
2407 }
2408 }
2409 }
2410 }
2411 }
2412 bottom_blob_bordered = Mat();
2413 // END transform input
2414
2415 // BEGIN dot
2416 Mat top_blob_tm;
2417 {
2418 int w_tm = outw / 4 * 6;
2419 int h_tm = outh / 4 * 6;
2420
2421 const int tiles = h_tm / 6 * w_tm / 6;
2422
2423 // permute
2424 // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator);
2425 Mat bottom_blob_tm2;
2426 #if __aarch64__
2427 if (tiles >= 12)
2428 bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + (tiles % 12 % 4) / 2 + tiles % 12 % 2, 36, elemsize, elempack, opt.workspace_allocator);
2429 else if (tiles >= 8)
2430 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2431 else if (tiles >= 4)
2432 bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2433 else if (tiles >= 2)
2434 bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2435 else // if (tiles >= 1)
2436 bottom_blob_tm2.create(1 * inch, tiles, 36, elemsize, elempack, opt.workspace_allocator);
2437 #else
2438 if (tiles >= 8)
2439 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2440 else if (tiles >= 4)
2441 bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2442 else if (tiles >= 2)
2443 bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, elemsize, elempack, opt.workspace_allocator);
2444 else // if (tiles >= 1)
2445 bottom_blob_tm2.create(1 * inch, tiles, 36, elemsize, elempack, opt.workspace_allocator);
2446 #endif
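        // bottom_blob_tm2 regroups the tiles of each transformed position into blocks
        // of 12/8/4/2/1 tiles (8/4/2/1 on armv7) so the dot micro-kernels below read
        // contiguous rows; index expressions such as
        // i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + ... count how many complete
        // blocks precede tile i and therefore select the destination row.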
2447
2448 #pragma omp parallel for num_threads(opt.num_threads)
2449 for (int r = 0; r < 36; r++)
2450 {
2451 Mat tm2 = bottom_blob_tm2.channel(r);
2452
2453 // tile
2454 int i = 0;
2455 #if __aarch64__
2456 for (; i + 11 < tiles; i += 12)
2457 {
2458 float* tm2p = tm2.row(i / 12);
2459
2460 const float* r0 = bottom_blob_tm;
2461
2462 r0 += (r * tiles + i) * 4;
2463
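                // For each packed input channel, the ld4/st1 sequence below transposes a
                // 12x4 block: each ld4 splits four pack4 tile values into per-lane
                // registers, and the stores then write lane 0 of all 12 tiles, lane 1,
                // and so on, which is the order the 12-tile dot kernel consumes.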
2464 for (int q = 0; q < inch; q++)
2465 {
2466 asm volatile(
2467 "prfm pldl1keep, [%0, #512] \n"
2468 "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
2469 "prfm pldl1keep, [%0, #512] \n"
2470 "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
2471 "prfm pldl1keep, [%0, #512] \n"
2472 "ld4 {v8.4s, v9.4s, v10.4s, v11.4s}, [%0] \n"
2473 "st1 {v0.4s}, [%1], #16 \n"
2474 "st1 {v4.4s}, [%1], #16 \n"
2475 "st1 {v8.4s}, [%1], #16 \n"
2476 "sub %0, %0, #128 \n"
2477 "st1 {v1.4s}, [%1], #16 \n"
2478 "st1 {v5.4s}, [%1], #16 \n"
2479 "st1 {v9.4s}, [%1], #16 \n"
2480 "st1 {v2.4s}, [%1], #16 \n"
2481 "st1 {v6.4s}, [%1], #16 \n"
2482 "st1 {v10.4s}, [%1], #16 \n"
2483 "st1 {v3.4s}, [%1], #16 \n"
2484 "st1 {v7.4s}, [%1], #16 \n"
2485 "st1 {v11.4s}, [%1], #16 \n"
2486 : "=r"(r0), // %0
2487 "=r"(tm2p) // %1
2488 : "0"(r0),
2489 "1"(tm2p)
2490 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
2491 r0 += bottom_blob_tm.cstep * 4;
2492 }
2493 }
2494 #endif
2495 for (; i + 7 < tiles; i += 8)
2496 {
2497 #if __aarch64__
2498 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
2499 #else
2500 float* tm2p = tm2.row(i / 8);
2501 #endif
2502
2503 const float* r0 = bottom_blob_tm;
2504
2505 r0 += (r * tiles + i) * 4;
2506
2507 for (int q = 0; q < inch; q++)
2508 {
2509 #if __aarch64__
2510 asm volatile(
2511 "prfm pldl1keep, [%0, #512] \n"
2512 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
2513 "prfm pldl1keep, [%0, #512] \n"
2514 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
2515 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
2516 "sub %0, %0, #64 \n"
2517 "st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"
2518 : "=r"(r0), // %0
2519 "=r"(tm2p) // %1
2520 : "0"(r0),
2521 "1"(tm2p)
2522 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
2523 #else
2524 asm volatile(
2525 "pld [%0, #512] \n"
2526 "vldm %0!, {d0-d7} \n"
2527 "pld [%0, #512] \n"
2528 "vldm %0, {d16-d23} \n"
2529
2530 // transpose 8x4
2531 "vtrn.32 q0, q1 \n"
2532 "vtrn.32 q2, q3 \n"
2533 "vtrn.32 q8, q9 \n"
2534 "vtrn.32 q10, q11 \n"
2535 "vswp d1, d4 \n"
2536 "vswp d3, d6 \n"
2537 "vswp d17, d20 \n"
2538 "vswp d19, d22 \n"
2539 "vswp q1, q8 \n"
2540 "vswp q3, q10 \n"
2541
2542 "vst1.f32 {d0-d3}, [%1 :128]! \n"
2543 "vst1.f32 {d16-d19}, [%1 :128]! \n"
2544 "sub %0, %0, #64 \n"
2545 "vst1.f32 {d4-d7}, [%1 :128]! \n"
2546 "vst1.f32 {d20-d23}, [%1 :128]! \n"
2547 : "=r"(r0), // %0
2548 "=r"(tm2p) // %1
2549 : "0"(r0),
2550 "1"(tm2p)
2551 : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
2552 #endif
2553 r0 += bottom_blob_tm.cstep * 4;
2554 }
2555 }
2556 for (; i + 3 < tiles; i += 4)
2557 {
2558 #if __aarch64__
2559 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
2560 #else
2561 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
2562 #endif
2563
2564 const float* r0 = bottom_blob_tm;
2565
2566 r0 += (r * tiles + i) * 4;
2567
2568 for (int q = 0; q < inch; q++)
2569 {
2570 #if __aarch64__
2571 asm volatile(
2572 "prfm pldl1keep, [%0, #512] \n"
2573 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
2574 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
2575 : "=r"(r0), // %0
2576 "=r"(tm2p) // %1
2577 : "0"(r0),
2578 "1"(tm2p)
2579 : "memory", "v0", "v1", "v2", "v3");
2580 #else
2581 asm volatile(
2582 "pld [%0, #512] \n"
2583 "vldm %0, {d0-d7} \n"
2584 "vstm %1!, {d0-d7} \n"
2585 : "=r"(r0), // %0
2586 "=r"(tm2p) // %1
2587 : "0"(r0),
2588 "1"(tm2p)
2589 : "memory", "q0", "q1", "q2", "q3");
2590 #endif // __aarch64__
2591 r0 += bottom_blob_tm.cstep * 4;
2592 }
2593 }
2594 for (; i + 1 < tiles; i += 2)
2595 {
2596 #if __aarch64__
2597 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
2598 #else
2599 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
2600 #endif
2601
2602 const float* r0 = bottom_blob_tm;
2603
2604 r0 += (r * tiles + i) * 4;
2605
2606 for (int q = 0; q < inch; q++)
2607 {
2608 #if __aarch64__
2609 asm volatile(
2610 "prfm pldl1keep, [%0, #256] \n"
2611 "ld1 {v0.4s, v1.4s}, [%0] \n"
2612 "st1 {v0.4s, v1.4s}, [%1], #32 \n"
2613 : "=r"(r0), // %0
2614 "=r"(tm2p) // %1
2615 : "0"(r0),
2616 "1"(tm2p)
2617 : "memory", "v0", "v1");
2618 #else
2619 asm volatile(
2620 "pld [%0, #256] \n"
2621 "vld1.f32 {d0-d3}, [%0 :128] \n"
2622 "vst1.f32 {d0-d3}, [%1 :128]! \n"
2623 : "=r"(r0), // %0
2624 "=r"(tm2p) // %1
2625 : "0"(r0),
2626 "1"(tm2p)
2627 : "memory", "q0", "q1");
2628 #endif // __aarch64__
2629 r0 += bottom_blob_tm.cstep * 4;
2630 }
2631 }
2632 for (; i < tiles; i++)
2633 {
2634 #if __aarch64__
2635 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
2636 #else
2637 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
2638 #endif
2639
2640 const float* r0 = bottom_blob_tm;
2641
2642 r0 += (r * tiles + i) * 4;
2643
2644 for (int q = 0; q < inch; q++)
2645 {
2646 #if __aarch64__
2647 asm volatile(
2648 "prfm pldl1keep, [%0, #128] \n"
2649 "ld1 {v0.4s}, [%0] \n"
2650 "st1 {v0.4s}, [%1], #16 \n"
2651 : "=r"(r0), // %0
2652 "=r"(tm2p) // %1
2653 : "0"(r0),
2654 "1"(tm2p)
2655 : "memory", "v0");
2656 #else
2657 asm volatile(
2658 "pld [%0, #128] \n"
2659 "vld1.f32 {d0-d1}, [%0 :128] \n"
2660 "vst1.f32 {d0-d1}, [%1 :128]! \n"
2661 : "=r"(r0), // %0
2662 "=r"(tm2p) // %1
2663 : "0"(r0),
2664 "1"(tm2p)
2665 : "memory", "q0");
2666 #endif // __aarch64__
2667 r0 += bottom_blob_tm.cstep * 4;
2668 }
2669 }
2670 }
2671
2672 bottom_blob_tm = Mat();
2673 // permute end
2674
2675 top_blob_tm.create(tiles, 36, outch, elemsize, elempack, opt.workspace_allocator);
2676
2677 int remain_outch_start = 0;
2678
2679 #if __ARM_NEON && __aarch64__
2680 int nn_outch = 0;
2681 nn_outch = outch >> 1;
2682 remain_outch_start = nn_outch << 1;
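    // On __aarch64__ output pack4 groups are processed in pairs (nn_outch), so one
    // micro-kernel fills two channels of top_blob_tm from the 8-wide packed weights;
    // any leftover group is handled by the per-channel loop starting at
    // remain_outch_start.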
2683
2684 #pragma omp parallel for num_threads(opt.num_threads)
2685 for (int pp = 0; pp < nn_outch; pp++)
2686 {
2687 int p = pp * 2;
2688
2689 float* output0_tm = top_blob_tm.channel(p);
2690 float* output1_tm = top_blob_tm.channel(p + 1);
2691
2692 const Mat kernel01_tm = kernel_tm.channel(pp);
2693
2694 for (int r = 0; r < 36; r++)
2695 {
2696 const Mat bb2 = bottom_blob_tm2.channel(r);
2697
2698 int i = 0;
2699 for (; i + 11 < tiles; i += 12)
2700 {
2701 const float* r0 = bb2.row(i / 12);
2702
2703 const float* k01 = kernel01_tm.row(r);
2704
2705 int nn = inch; // inch always > 0
2706
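                // 12-tile x 2-output-channel micro-kernel: v8-v19 accumulate the twelve
                // pack4 results for output channel p and v20-v31 those for p + 1. Each
                // loop iteration consumes one packed input channel: 48 activation floats
                // (12 tiles x 4 lanes) and 32 weight floats (4 lanes x 2 channels x 4).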
2707 asm volatile(
2708 "eor v8.16b, v8.16b, v8.16b \n"
2709 "eor v9.16b, v9.16b, v9.16b \n"
2710 "eor v10.16b, v10.16b, v10.16b \n"
2711 "eor v11.16b, v11.16b, v11.16b \n"
2712 "eor v12.16b, v12.16b, v12.16b \n"
2713 "eor v13.16b, v13.16b, v13.16b \n"
2714 "eor v14.16b, v14.16b, v14.16b \n"
2715 "eor v15.16b, v15.16b, v15.16b \n"
2716 "eor v16.16b, v16.16b, v16.16b \n"
2717 "eor v17.16b, v17.16b, v17.16b \n"
2718 "eor v18.16b, v18.16b, v18.16b \n"
2719 "eor v19.16b, v19.16b, v19.16b \n"
2720 "eor v20.16b, v20.16b, v20.16b \n"
2721 "eor v21.16b, v21.16b, v21.16b \n"
2722 "eor v22.16b, v22.16b, v22.16b \n"
2723 "eor v23.16b, v23.16b, v23.16b \n"
2724 "eor v24.16b, v24.16b, v24.16b \n"
2725 "eor v25.16b, v25.16b, v25.16b \n"
2726 "eor v26.16b, v26.16b, v26.16b \n"
2727 "eor v27.16b, v27.16b, v27.16b \n"
2728 "eor v28.16b, v28.16b, v28.16b \n"
2729 "eor v29.16b, v29.16b, v29.16b \n"
2730 "eor v30.16b, v30.16b, v30.16b \n"
2731 "eor v31.16b, v31.16b, v31.16b \n"
2732
2733 "0: \n"
2734
2735 "prfm pldl1keep, [%3, #512] \n"
2736 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2737
2738 "prfm pldl1keep, [%4, #512] \n"
2739 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w0011_01
2740
2741 "fmla v8.4s, v4.4s, v0.s[0] \n"
2742 "fmla v9.4s, v4.4s, v0.s[1] \n"
2743 "fmla v10.4s, v4.4s, v0.s[2] \n"
2744 "fmla v11.4s, v4.4s, v0.s[3] \n"
2745 "fmla v12.4s, v4.4s, v1.s[0] \n"
2746 "fmla v13.4s, v4.4s, v1.s[1] \n"
2747 "fmla v14.4s, v4.4s, v1.s[2] \n"
2748 "fmla v15.4s, v4.4s, v1.s[3] \n"
2749 "fmla v16.4s, v4.4s, v2.s[0] \n"
2750 "fmla v17.4s, v4.4s, v2.s[1] \n"
2751 "fmla v18.4s, v4.4s, v2.s[2] \n"
2752 "fmla v19.4s, v4.4s, v2.s[3] \n"
2753
2754 "fmla v20.4s, v5.4s, v0.s[0] \n"
2755 "fmla v21.4s, v5.4s, v0.s[1] \n"
2756 "fmla v22.4s, v5.4s, v0.s[2] \n"
2757 "fmla v23.4s, v5.4s, v0.s[3] \n"
2758 "fmla v24.4s, v5.4s, v1.s[0] \n"
2759 "fmla v25.4s, v5.4s, v1.s[1] \n"
2760 "fmla v26.4s, v5.4s, v1.s[2] \n"
2761 "fmla v27.4s, v5.4s, v1.s[3] \n"
2762 "fmla v28.4s, v5.4s, v2.s[0] \n"
2763 "fmla v29.4s, v5.4s, v2.s[1] \n"
2764 "fmla v30.4s, v5.4s, v2.s[2] \n"
2765 "fmla v31.4s, v5.4s, v2.s[3] \n"
2766
2767 "fmla v8.4s, v6.4s, v3.s[0] \n"
2768 "fmla v9.4s, v6.4s, v3.s[1] \n"
2769 "fmla v10.4s, v6.4s, v3.s[2] \n"
2770 "fmla v11.4s, v6.4s, v3.s[3] \n"
2771
2772 "fmla v20.4s, v7.4s, v3.s[0] \n"
2773 "fmla v21.4s, v7.4s, v3.s[1] \n"
2774 "fmla v22.4s, v7.4s, v3.s[2] \n"
2775 "fmla v23.4s, v7.4s, v3.s[3] \n"
2776
2777 "prfm pldl1keep, [%3, #512] \n"
2778 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2779
2780 "fmla v12.4s, v6.4s, v0.s[0] \n"
2781 "fmla v13.4s, v6.4s, v0.s[1] \n"
2782 "fmla v14.4s, v6.4s, v0.s[2] \n"
2783 "fmla v15.4s, v6.4s, v0.s[3] \n"
2784 "fmla v16.4s, v6.4s, v1.s[0] \n"
2785 "fmla v17.4s, v6.4s, v1.s[1] \n"
2786 "fmla v18.4s, v6.4s, v1.s[2] \n"
2787 "fmla v19.4s, v6.4s, v1.s[3] \n"
2788
2789 "fmla v24.4s, v7.4s, v0.s[0] \n"
2790 "fmla v25.4s, v7.4s, v0.s[1] \n"
2791 "fmla v26.4s, v7.4s, v0.s[2] \n"
2792 "fmla v27.4s, v7.4s, v0.s[3] \n"
2793 "fmla v28.4s, v7.4s, v1.s[0] \n"
2794 "fmla v29.4s, v7.4s, v1.s[1] \n"
2795 "fmla v30.4s, v7.4s, v1.s[2] \n"
2796 "fmla v31.4s, v7.4s, v1.s[3] \n"
2797
2798 "prfm pldl1keep, [%4, #512] \n"
2799 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%4], #64 \n" // w2233_01
2800
2801 "fmla v8.4s, v4.4s, v2.s[0] \n"
2802 "fmla v9.4s, v4.4s, v2.s[1] \n"
2803 "fmla v10.4s, v4.4s, v2.s[2] \n"
2804 "fmla v11.4s, v4.4s, v2.s[3] \n"
2805 "fmla v12.4s, v4.4s, v3.s[0] \n"
2806 "fmla v13.4s, v4.4s, v3.s[1] \n"
2807 "fmla v14.4s, v4.4s, v3.s[2] \n"
2808 "fmla v15.4s, v4.4s, v3.s[3] \n"
2809
2810 "fmla v20.4s, v5.4s, v2.s[0] \n"
2811 "fmla v21.4s, v5.4s, v2.s[1] \n"
2812 "fmla v22.4s, v5.4s, v2.s[2] \n"
2813 "fmla v23.4s, v5.4s, v2.s[3] \n"
2814 "fmla v24.4s, v5.4s, v3.s[0] \n"
2815 "fmla v25.4s, v5.4s, v3.s[1] \n"
2816 "fmla v26.4s, v5.4s, v3.s[2] \n"
2817 "fmla v27.4s, v5.4s, v3.s[3] \n"
2818
2819 "prfm pldl1keep, [%3, #512] \n"
2820 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"
2821
2822 "fmla v16.4s, v4.4s, v0.s[0] \n"
2823 "fmla v17.4s, v4.4s, v0.s[1] \n"
2824 "fmla v18.4s, v4.4s, v0.s[2] \n"
2825 "fmla v19.4s, v4.4s, v0.s[3] \n"
2826
2827 "fmla v28.4s, v5.4s, v0.s[0] \n"
2828 "fmla v29.4s, v5.4s, v0.s[1] \n"
2829 "fmla v30.4s, v5.4s, v0.s[2] \n"
2830 "fmla v31.4s, v5.4s, v0.s[3] \n"
2831
2832 "fmla v8.4s, v6.4s, v1.s[0] \n"
2833 "fmla v9.4s, v6.4s, v1.s[1] \n"
2834 "fmla v10.4s, v6.4s, v1.s[2] \n"
2835 "fmla v11.4s, v6.4s, v1.s[3] \n"
2836 "fmla v12.4s, v6.4s, v2.s[0] \n"
2837 "fmla v13.4s, v6.4s, v2.s[1] \n"
2838 "fmla v14.4s, v6.4s, v2.s[2] \n"
2839 "fmla v15.4s, v6.4s, v2.s[3] \n"
2840 "fmla v16.4s, v6.4s, v3.s[0] \n"
2841 "fmla v17.4s, v6.4s, v3.s[1] \n"
2842 "fmla v18.4s, v6.4s, v3.s[2] \n"
2843 "fmla v19.4s, v6.4s, v3.s[3] \n"
2844
2845 "subs %w0, %w0, #1 \n"
2846
2847 "fmla v20.4s, v7.4s, v1.s[0] \n"
2848 "fmla v21.4s, v7.4s, v1.s[1] \n"
2849 "fmla v22.4s, v7.4s, v1.s[2] \n"
2850 "fmla v23.4s, v7.4s, v1.s[3] \n"
2851 "fmla v24.4s, v7.4s, v2.s[0] \n"
2852 "fmla v25.4s, v7.4s, v2.s[1] \n"
2853 "fmla v26.4s, v7.4s, v2.s[2] \n"
2854 "fmla v27.4s, v7.4s, v2.s[3] \n"
2855 "fmla v28.4s, v7.4s, v3.s[0] \n"
2856 "fmla v29.4s, v7.4s, v3.s[1] \n"
2857 "fmla v30.4s, v7.4s, v3.s[2] \n"
2858 "fmla v31.4s, v7.4s, v3.s[3] \n"
2859
2860 "bne 0b \n"
2861
2862 "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
2863 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
2864 "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
2865 "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
2866 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
2867 "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
2868
2869 : "=r"(nn), // %0
2870 "=r"(output0_tm), // %1
2871 "=r"(output1_tm), // %2
2872 "=r"(r0), // %3
2873 "=r"(k01) // %4
2874 : "0"(nn),
2875 "1"(output0_tm),
2876 "2"(output1_tm),
2877 "3"(r0),
2878 "4"(k01)
2879 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
2880 }
2881 for (; i + 7 < tiles; i += 8)
2882 {
2883 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
2884
2885 const float* k01 = kernel01_tm.row(r);
2886
2887 int nn = inch; // inch always > 0
2888
2889 asm volatile(
2890 "eor v16.16b, v16.16b, v16.16b \n"
2891 "eor v17.16b, v17.16b, v17.16b \n"
2892 "eor v18.16b, v18.16b, v18.16b \n"
2893 "eor v19.16b, v19.16b, v19.16b \n"
2894 "eor v20.16b, v20.16b, v20.16b \n"
2895 "eor v21.16b, v21.16b, v21.16b \n"
2896 "eor v22.16b, v22.16b, v22.16b \n"
2897 "eor v23.16b, v23.16b, v23.16b \n"
2898 "eor v24.16b, v24.16b, v24.16b \n"
2899 "eor v25.16b, v25.16b, v25.16b \n"
2900 "eor v26.16b, v26.16b, v26.16b \n"
2901 "eor v27.16b, v27.16b, v27.16b \n"
2902 "eor v28.16b, v28.16b, v28.16b \n"
2903 "eor v29.16b, v29.16b, v29.16b \n"
2904 "eor v30.16b, v30.16b, v30.16b \n"
2905 "eor v31.16b, v31.16b, v31.16b \n"
2906
2907 "0: \n"
2908
2909 "prfm pldl1keep, [%3, #512] \n"
2910 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
2911
2912 "prfm pldl1keep, [%4, #512] \n"
2913 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
2914
2915 "prfm pldl1keep, [%3, #512] \n"
2916 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r4 r5 r6 r7
2917
2918 "fmla v16.4s, v8.4s, v0.s[0] \n"
2919 "fmla v17.4s, v8.4s, v1.s[0] \n"
2920 "fmla v18.4s, v8.4s, v2.s[0] \n"
2921 "fmla v19.4s, v8.4s, v3.s[0] \n"
2922 "fmla v20.4s, v8.4s, v4.s[0] \n"
2923 "fmla v21.4s, v8.4s, v5.s[0] \n"
2924 "fmla v22.4s, v8.4s, v6.s[0] \n"
2925 "fmla v23.4s, v8.4s, v7.s[0] \n"
2926
2927 "fmla v24.4s, v9.4s, v0.s[0] \n"
2928 "fmla v25.4s, v9.4s, v1.s[0] \n"
2929 "fmla v26.4s, v9.4s, v2.s[0] \n"
2930 "fmla v27.4s, v9.4s, v3.s[0] \n"
2931 "fmla v28.4s, v9.4s, v4.s[0] \n"
2932 "fmla v29.4s, v9.4s, v5.s[0] \n"
2933 "fmla v30.4s, v9.4s, v6.s[0] \n"
2934 "fmla v31.4s, v9.4s, v7.s[0] \n"
2935
2936 "prfm pldl1keep, [%4, #512] \n"
2937 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
2938
2939 "fmla v16.4s, v10.4s, v0.s[1] \n"
2940 "fmla v17.4s, v10.4s, v1.s[1] \n"
2941 "fmla v18.4s, v10.4s, v2.s[1] \n"
2942 "fmla v19.4s, v10.4s, v3.s[1] \n"
2943 "fmla v20.4s, v10.4s, v4.s[1] \n"
2944 "fmla v21.4s, v10.4s, v5.s[1] \n"
2945 "fmla v22.4s, v10.4s, v6.s[1] \n"
2946 "fmla v23.4s, v10.4s, v7.s[1] \n"
2947
2948 "fmla v24.4s, v11.4s, v0.s[1] \n"
2949 "fmla v25.4s, v11.4s, v1.s[1] \n"
2950 "fmla v26.4s, v11.4s, v2.s[1] \n"
2951 "fmla v27.4s, v11.4s, v3.s[1] \n"
2952 "fmla v28.4s, v11.4s, v4.s[1] \n"
2953 "fmla v29.4s, v11.4s, v5.s[1] \n"
2954 "fmla v30.4s, v11.4s, v6.s[1] \n"
2955 "fmla v31.4s, v11.4s, v7.s[1] \n"
2956
2957 "fmla v16.4s, v12.4s, v0.s[2] \n"
2958 "fmla v17.4s, v12.4s, v1.s[2] \n"
2959 "fmla v18.4s, v12.4s, v2.s[2] \n"
2960 "fmla v19.4s, v12.4s, v3.s[2] \n"
2961 "fmla v20.4s, v12.4s, v4.s[2] \n"
2962 "fmla v21.4s, v12.4s, v5.s[2] \n"
2963 "fmla v22.4s, v12.4s, v6.s[2] \n"
2964 "fmla v23.4s, v12.4s, v7.s[2] \n"
2965
2966 "fmla v24.4s, v13.4s, v0.s[2] \n"
2967 "fmla v25.4s, v13.4s, v1.s[2] \n"
2968 "fmla v26.4s, v13.4s, v2.s[2] \n"
2969 "fmla v27.4s, v13.4s, v3.s[2] \n"
2970 "fmla v28.4s, v13.4s, v4.s[2] \n"
2971 "fmla v29.4s, v13.4s, v5.s[2] \n"
2972 "fmla v30.4s, v13.4s, v6.s[2] \n"
2973 "fmla v31.4s, v13.4s, v7.s[2] \n"
2974
2975 "fmla v16.4s, v14.4s, v0.s[3] \n"
2976 "fmla v17.4s, v14.4s, v1.s[3] \n"
2977 "fmla v18.4s, v14.4s, v2.s[3] \n"
2978 "fmla v19.4s, v14.4s, v3.s[3] \n"
2979 "fmla v20.4s, v14.4s, v4.s[3] \n"
2980 "fmla v21.4s, v14.4s, v5.s[3] \n"
2981 "fmla v22.4s, v14.4s, v6.s[3] \n"
2982 "fmla v23.4s, v14.4s, v7.s[3] \n"
2983
2984 "subs %w0, %w0, #1 \n"
2985
2986 "fmla v24.4s, v15.4s, v0.s[3] \n"
2987 "fmla v25.4s, v15.4s, v1.s[3] \n"
2988 "fmla v26.4s, v15.4s, v2.s[3] \n"
2989 "fmla v27.4s, v15.4s, v3.s[3] \n"
2990 "fmla v28.4s, v15.4s, v4.s[3] \n"
2991 "fmla v29.4s, v15.4s, v5.s[3] \n"
2992 "fmla v30.4s, v15.4s, v6.s[3] \n"
2993 "fmla v31.4s, v15.4s, v7.s[3] \n"
2994
2995 "bne 0b \n"
2996
2997 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
2998 "st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
2999 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
3000 "st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [%2], #64 \n"
3001
3002 : "=r"(nn), // %0
3003 "=r"(output0_tm), // %1
3004 "=r"(output1_tm), // %2
3005 "=r"(r0), // %3
3006 "=r"(k01) // %4
3007 : "0"(nn),
3008 "1"(output0_tm),
3009 "2"(output1_tm),
3010 "3"(r0),
3011 "4"(k01)
3012 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
3013 }
3014 for (; i + 3 < tiles; i += 4)
3015 {
3016 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
3017
3018 const float* k01 = kernel01_tm.row(r);
3019
3020 int nn = inch; // inch always > 0
3021
3022 asm volatile(
3023 "eor v16.16b, v16.16b, v16.16b \n"
3024 "eor v17.16b, v17.16b, v17.16b \n"
3025 "eor v18.16b, v18.16b, v18.16b \n"
3026 "eor v19.16b, v19.16b, v19.16b \n"
3027 "eor v20.16b, v20.16b, v20.16b \n"
3028 "eor v21.16b, v21.16b, v21.16b \n"
3029 "eor v22.16b, v22.16b, v22.16b \n"
3030 "eor v23.16b, v23.16b, v23.16b \n"
3031
3032 "0: \n"
3033
3034 "prfm pldl1keep, [%3, #512] \n"
3035 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r0 r1 r2 r3
3036
3037 "prfm pldl1keep, [%4, #512] \n"
3038 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3039
3040 "fmla v16.4s, v8.4s, v0.s[0] \n"
3041 "fmla v17.4s, v8.4s, v1.s[0] \n"
3042 "fmla v18.4s, v8.4s, v2.s[0] \n"
3043 "fmla v19.4s, v8.4s, v3.s[0] \n"
3044
3045 "fmla v20.4s, v9.4s, v0.s[0] \n"
3046 "fmla v21.4s, v9.4s, v1.s[0] \n"
3047 "fmla v22.4s, v9.4s, v2.s[0] \n"
3048 "fmla v23.4s, v9.4s, v3.s[0] \n"
3049
3050 "prfm pldl1keep, [%4, #512] \n"
3051 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3052
3053 "fmla v16.4s, v10.4s, v0.s[1] \n"
3054 "fmla v17.4s, v10.4s, v1.s[1] \n"
3055 "fmla v18.4s, v10.4s, v2.s[1] \n"
3056 "fmla v19.4s, v10.4s, v3.s[1] \n"
3057
3058 "fmla v20.4s, v11.4s, v0.s[1] \n"
3059 "fmla v21.4s, v11.4s, v1.s[1] \n"
3060 "fmla v22.4s, v11.4s, v2.s[1] \n"
3061 "fmla v23.4s, v11.4s, v3.s[1] \n"
3062
3063 "fmla v16.4s, v12.4s, v0.s[2] \n"
3064 "fmla v17.4s, v12.4s, v1.s[2] \n"
3065 "fmla v18.4s, v12.4s, v2.s[2] \n"
3066 "fmla v19.4s, v12.4s, v3.s[2] \n"
3067
3068 "fmla v20.4s, v13.4s, v0.s[2] \n"
3069 "fmla v21.4s, v13.4s, v1.s[2] \n"
3070 "fmla v22.4s, v13.4s, v2.s[2] \n"
3071 "fmla v23.4s, v13.4s, v3.s[2] \n"
3072
3073 "subs %w0, %w0, #1 \n"
3074
3075 "fmla v16.4s, v14.4s, v0.s[3] \n"
3076 "fmla v17.4s, v14.4s, v1.s[3] \n"
3077 "fmla v18.4s, v14.4s, v2.s[3] \n"
3078 "fmla v19.4s, v14.4s, v3.s[3] \n"
3079
3080 "fmla v20.4s, v15.4s, v0.s[3] \n"
3081 "fmla v21.4s, v15.4s, v1.s[3] \n"
3082 "fmla v22.4s, v15.4s, v2.s[3] \n"
3083 "fmla v23.4s, v15.4s, v3.s[3] \n"
3084
3085 "bne 0b \n"
3086
3087 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3088 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
3089
3090 : "=r"(nn), // %0
3091 "=r"(output0_tm), // %1
3092 "=r"(output1_tm), // %2
3093 "=r"(r0), // %3
3094 "=r"(k01) // %4
3095 : "0"(nn),
3096 "1"(output0_tm),
3097 "2"(output1_tm),
3098 "3"(r0),
3099 "4"(k01)
3100 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
3101 }
3102 for (; i + 1 < tiles; i += 2)
3103 {
3104 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
3105
3106 const float* k01 = kernel01_tm.row(r);
3107
3108 int nn = inch; // inch always > 0
3109
3110 asm volatile(
3111 "eor v16.16b, v16.16b, v16.16b \n"
3112 "eor v17.16b, v17.16b, v17.16b \n"
3113 "eor v18.16b, v18.16b, v18.16b \n"
3114 "eor v19.16b, v19.16b, v19.16b \n"
3115
3116 "0: \n"
3117
3118 "prfm pldl1keep, [%3, #256] \n"
3119 "ld1 {v0.4s, v1.4s}, [%3], #32 \n" // r0 r1
3120
3121 "prfm pldl1keep, [%4, #512] \n"
3122 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3123
3124 "fmla v16.4s, v8.4s, v0.s[0] \n"
3125 "fmla v17.4s, v8.4s, v1.s[0] \n"
3126 "fmla v18.4s, v9.4s, v0.s[0] \n"
3127 "fmla v19.4s, v9.4s, v1.s[0] \n"
3128
3129 "prfm pldl1keep, [%4, #512] \n"
3130 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3131
3132 "fmla v16.4s, v10.4s, v0.s[1] \n"
3133 "fmla v17.4s, v10.4s, v1.s[1] \n"
3134 "fmla v18.4s, v11.4s, v0.s[1] \n"
3135 "fmla v19.4s, v11.4s, v1.s[1] \n"
3136
3137 "fmla v16.4s, v12.4s, v0.s[2] \n"
3138 "fmla v17.4s, v12.4s, v1.s[2] \n"
3139 "fmla v18.4s, v13.4s, v0.s[2] \n"
3140 "fmla v19.4s, v13.4s, v1.s[2] \n"
3141
3142 "subs %w0, %w0, #1 \n"
3143
3144 "fmla v16.4s, v14.4s, v0.s[3] \n"
3145 "fmla v17.4s, v14.4s, v1.s[3] \n"
3146 "fmla v18.4s, v15.4s, v0.s[3] \n"
3147 "fmla v19.4s, v15.4s, v1.s[3] \n"
3148
3149 "bne 0b \n"
3150
3151 "st1 {v16.4s, v17.4s}, [%1], #32 \n"
3152 "st1 {v18.4s, v19.4s}, [%2], #32 \n"
3153
3154 : "=r"(nn), // %0
3155 "=r"(output0_tm), // %1
3156 "=r"(output1_tm), // %2
3157 "=r"(r0), // %3
3158 "=r"(k01) // %4
3159 : "0"(nn),
3160 "1"(output0_tm),
3161 "2"(output1_tm),
3162 "3"(r0),
3163 "4"(k01)
3164 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
3165 }
3166 for (; i < tiles; i++)
3167 {
3168 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
3169
3170 const float* k01 = kernel01_tm.row(r);
3171
3172 int nn = inch; // inch always > 0
3173
3174 asm volatile(
3175 "eor v16.16b, v16.16b, v16.16b \n"
3176 "eor v17.16b, v17.16b, v17.16b \n"
3177
3178 "0: \n"
3179
3180 "prfm pldl1keep, [%3, #128] \n"
3181 "ld1 {v0.4s}, [%3], #16 \n" // r0
3182
3183 "prfm pldl1keep, [%4, #512] \n"
3184 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%4], #64 \n" // w0011_01
3185
3186 "fmla v16.4s, v8.4s, v0.s[0] \n"
3187 "fmla v17.4s, v9.4s, v0.s[0] \n"
3188
3189 "prfm pldl1keep, [%4, #512] \n"
3190 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%4], #64 \n" // w2233_01
3191
3192 "fmla v16.4s, v10.4s, v0.s[1] \n"
3193 "fmla v17.4s, v11.4s, v0.s[1] \n"
3194
3195 "fmla v16.4s, v12.4s, v0.s[2] \n"
3196 "fmla v17.4s, v13.4s, v0.s[2] \n"
3197
3198 "subs %w0, %w0, #1 \n"
3199
3200 "fmla v16.4s, v14.4s, v0.s[3] \n"
3201 "fmla v17.4s, v15.4s, v0.s[3] \n"
3202
3203 "bne 0b \n"
3204
3205 "st1 {v16.4s}, [%1], #16 \n"
3206 "st1 {v17.4s}, [%2], #16 \n"
3207
3208 : "=r"(nn), // %0
3209 "=r"(output0_tm), // %1
3210 "=r"(output1_tm), // %2
3211 "=r"(r0), // %3
3212 "=r"(k01) // %4
3213 : "0"(nn),
3214 "1"(output0_tm),
3215 "2"(output1_tm),
3216 "3"(r0),
3217 "4"(k01)
3218 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17");
3219 }
3220 }
3221 }
3222 #endif // __ARM_NEON && __aarch64__
3223
3224 #pragma omp parallel for num_threads(opt.num_threads)
3225 for (int p = remain_outch_start; p < outch; p++)
3226 {
3227 float* output0_tm = top_blob_tm.channel(p);
3228
3229 #if __aarch64__
3230 const Mat kernel0_tm = kernel_tm.channel(p / 2 + p % 2);
3231 #else
3232 const Mat kernel0_tm = kernel_tm.channel(p);
3233 #endif
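        // The packed kernel stores eight raw output channels (two pack4 groups) per
        // Mat channel on __aarch64__, so a leftover group p maps to channel
        // p / 2 + p % 2; on armv7 each group has its own channel.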
3234
3235 for (int r = 0; r < 36; r++)
3236 {
3237 const Mat bb2 = bottom_blob_tm2.channel(r);
3238
3239 int i = 0;
3240 #if __aarch64__
3241 for (; i + 11 < tiles; i += 12)
3242 {
3243 const float* r0 = bb2.row(i / 12);
3244
3245 const float* k0 = kernel0_tm.row(r);
3246
3247 int nn = inch; // inch always > 0
3248
3249 asm volatile(
3250 "eor v8.16b, v8.16b, v8.16b \n"
3251 "eor v9.16b, v9.16b, v9.16b \n"
3252 "eor v10.16b, v10.16b, v10.16b \n"
3253 "eor v11.16b, v11.16b, v11.16b \n"
3254 "eor v12.16b, v12.16b, v12.16b \n"
3255 "eor v13.16b, v13.16b, v13.16b \n"
3256 "eor v14.16b, v14.16b, v14.16b \n"
3257 "eor v15.16b, v15.16b, v15.16b \n"
3258 "eor v16.16b, v16.16b, v16.16b \n"
3259 "eor v17.16b, v17.16b, v17.16b \n"
3260 "eor v18.16b, v18.16b, v18.16b \n"
3261 "eor v19.16b, v19.16b, v19.16b \n"
3262
3263 "0: \n"
3264
3265 "prfm pldl1keep, [%2, #512] \n"
3266 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
3267
3268 "prfm pldl1keep, [%3, #512] \n"
3269 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // w0123_0
3270
3271 "fmla v8.4s, v4.4s, v0.s[0] \n"
3272 "fmla v9.4s, v4.4s, v0.s[1] \n"
3273 "fmla v10.4s, v4.4s, v0.s[2] \n"
3274 "fmla v11.4s, v4.4s, v0.s[3] \n"
3275 "fmla v12.4s, v4.4s, v1.s[0] \n"
3276 "fmla v13.4s, v4.4s, v1.s[1] \n"
3277 "fmla v14.4s, v4.4s, v1.s[2] \n"
3278 "fmla v15.4s, v4.4s, v1.s[3] \n"
3279 "fmla v16.4s, v4.4s, v2.s[0] \n"
3280 "fmla v17.4s, v4.4s, v2.s[1] \n"
3281 "fmla v18.4s, v4.4s, v2.s[2] \n"
3282 "fmla v19.4s, v4.4s, v2.s[3] \n"
3283
3284 "prfm pldl1keep, [%2, #512] \n"
3285 "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%2], #64 \n"
3286
3287 "fmla v8.4s, v5.4s, v3.s[0] \n"
3288 "fmla v9.4s, v5.4s, v3.s[1] \n"
3289 "fmla v10.4s, v5.4s, v3.s[2] \n"
3290 "fmla v11.4s, v5.4s, v3.s[3] \n"
3291 "fmla v12.4s, v5.4s, v20.s[0] \n"
3292 "fmla v13.4s, v5.4s, v20.s[1] \n"
3293 "fmla v14.4s, v5.4s, v20.s[2] \n"
3294 "fmla v15.4s, v5.4s, v20.s[3] \n"
3295 "fmla v16.4s, v5.4s, v21.s[0] \n"
3296 "fmla v17.4s, v5.4s, v21.s[1] \n"
3297 "fmla v18.4s, v5.4s, v21.s[2] \n"
3298 "fmla v19.4s, v5.4s, v21.s[3] \n"
3299
3300 "prfm pldl1keep, [%2, #512] \n"
3301 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%2], #64 \n"
3302
3303 "fmla v8.4s, v6.4s, v22.s[0] \n"
3304 "fmla v9.4s, v6.4s, v22.s[1] \n"
3305 "fmla v10.4s, v6.4s, v22.s[2] \n"
3306 "fmla v11.4s, v6.4s, v22.s[3] \n"
3307 "fmla v12.4s, v6.4s, v23.s[0] \n"
3308 "fmla v13.4s, v6.4s, v23.s[1] \n"
3309 "fmla v14.4s, v6.4s, v23.s[2] \n"
3310 "fmla v15.4s, v6.4s, v23.s[3] \n"
3311 "fmla v16.4s, v6.4s, v24.s[0] \n"
3312 "fmla v17.4s, v6.4s, v24.s[1] \n"
3313 "fmla v18.4s, v6.4s, v24.s[2] \n"
3314 "fmla v19.4s, v6.4s, v24.s[3] \n"
3315
3316 "subs %w0, %w0, #1 \n"
3317
3318 "fmla v8.4s, v7.4s, v25.s[0] \n"
3319 "fmla v9.4s, v7.4s, v25.s[1] \n"
3320 "fmla v10.4s, v7.4s, v25.s[2] \n"
3321 "fmla v11.4s, v7.4s, v25.s[3] \n"
3322 "fmla v12.4s, v7.4s, v26.s[0] \n"
3323 "fmla v13.4s, v7.4s, v26.s[1] \n"
3324 "fmla v14.4s, v7.4s, v26.s[2] \n"
3325 "fmla v15.4s, v7.4s, v26.s[3] \n"
3326 "fmla v16.4s, v7.4s, v27.s[0] \n"
3327 "fmla v17.4s, v7.4s, v27.s[1] \n"
3328 "fmla v18.4s, v7.4s, v27.s[2] \n"
3329 "fmla v19.4s, v7.4s, v27.s[3] \n"
3330
3331 "bne 0b \n"
3332
3333 "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%1], #64 \n"
3334 "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%1], #64 \n"
3335 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3336
3337 : "=r"(nn), // %0
3338 "=r"(output0_tm), // %1
3339 "=r"(r0), // %2
3340 "=r"(k0) // %3
3341 : "0"(nn),
3342 "1"(output0_tm),
3343 "2"(r0),
3344 "3"(k0)
3345 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
3346 }
3347 #endif
3348 for (; i + 7 < tiles; i += 8)
3349 {
3350 #if __aarch64__
3351 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
3352 #else
3353 const float* r0 = bb2.row(i / 8);
3354 #endif
3355
3356 const float* k0 = kernel0_tm.row(r);
3357
3358 int nn = inch; // inch always > 0
3359
3360 #if __aarch64__
3361 asm volatile(
3362 "eor v16.16b, v16.16b, v16.16b \n"
3363 "eor v17.16b, v17.16b, v17.16b \n"
3364 "eor v18.16b, v18.16b, v18.16b \n"
3365 "eor v19.16b, v19.16b, v19.16b \n"
3366 "eor v20.16b, v20.16b, v20.16b \n"
3367 "eor v21.16b, v21.16b, v21.16b \n"
3368 "eor v22.16b, v22.16b, v22.16b \n"
3369 "eor v23.16b, v23.16b, v23.16b \n"
3370
3371 "0: \n"
3372
3373 "prfm pldl1keep, [%2, #512] \n"
3374 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
3375
3376 "prfm pldl1keep, [%3, #512] \n"
3377 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3378
3379 "fmla v16.4s, v8.4s, v0.s[0] \n"
3380 "fmla v17.4s, v8.4s, v1.s[0] \n"
3381 "fmla v18.4s, v8.4s, v2.s[0] \n"
3382 "fmla v19.4s, v8.4s, v3.s[0] \n"
3383
3384 "prfm pldl1keep, [%2, #512] \n"
3385 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n" // r4 r5 r6 r7
3386
3387 "fmla v20.4s, v8.4s, v4.s[0] \n"
3388 "fmla v21.4s, v8.4s, v5.s[0] \n"
3389 "fmla v22.4s, v8.4s, v6.s[0] \n"
3390 "fmla v23.4s, v8.4s, v7.s[0] \n"
3391
3392 "fmla v16.4s, v9.4s, v0.s[1] \n"
3393 "fmla v17.4s, v9.4s, v1.s[1] \n"
3394 "fmla v18.4s, v9.4s, v2.s[1] \n"
3395 "fmla v19.4s, v9.4s, v3.s[1] \n"
3396 "fmla v20.4s, v9.4s, v4.s[1] \n"
3397 "fmla v21.4s, v9.4s, v5.s[1] \n"
3398 "fmla v22.4s, v9.4s, v6.s[1] \n"
3399 "fmla v23.4s, v9.4s, v7.s[1] \n"
3400
3401 "fmla v16.4s, v10.4s, v0.s[2] \n"
3402 "fmla v17.4s, v10.4s, v1.s[2] \n"
3403 "fmla v18.4s, v10.4s, v2.s[2] \n"
3404 "fmla v19.4s, v10.4s, v3.s[2] \n"
3405 "fmla v20.4s, v10.4s, v4.s[2] \n"
3406 "fmla v21.4s, v10.4s, v5.s[2] \n"
3407 "fmla v22.4s, v10.4s, v6.s[2] \n"
3408 "fmla v23.4s, v10.4s, v7.s[2] \n"
3409
3410 "subs %w0, %w0, #1 \n"
3411
3412 "fmla v16.4s, v11.4s, v0.s[3] \n"
3413 "fmla v17.4s, v11.4s, v1.s[3] \n"
3414 "fmla v18.4s, v11.4s, v2.s[3] \n"
3415 "fmla v19.4s, v11.4s, v3.s[3] \n"
3416 "fmla v20.4s, v11.4s, v4.s[3] \n"
3417 "fmla v21.4s, v11.4s, v5.s[3] \n"
3418 "fmla v22.4s, v11.4s, v6.s[3] \n"
3419 "fmla v23.4s, v11.4s, v7.s[3] \n"
3420
3421 "bne 0b \n"
3422
3423 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3424 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
3425
3426 : "=r"(nn), // %0
3427 "=r"(output0_tm), // %1
3428 "=r"(r0), // %2
3429 "=r"(k0) // %3
3430 : "0"(nn),
3431 "1"(output0_tm),
3432 "2"(r0),
3433 "3"(k0)
3434 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
3435 #else
3436 asm volatile(
3437 "veor q8, q8 \n"
3438 "veor q9, q9 \n"
3439 "veor q10, q10 \n"
3440 "veor q11, q11 \n"
3441 "veor q12, q12 \n"
3442 "veor q13, q13 \n"
3443 "veor q14, q14 \n"
3444 "veor q15, q15 \n"
3445
3446 "0: \n"
3447
3448 "pld [%2, #512] \n"
3449 "vldm %2!, {d0-d7} \n"
3450
3451 "pld [%3, #512] \n"
3452 "vldm %3!, {d8-d15} \n"
3453
3454 "vmla.f32 q8, q4, d0[0] \n"
3455 "vmla.f32 q9, q4, d0[1] \n"
3456 "vmla.f32 q10, q4, d1[0] \n"
3457 "vmla.f32 q11, q4, d1[1] \n"
3458 "vmla.f32 q12, q4, d2[0] \n"
3459 "vmla.f32 q13, q4, d2[1] \n"
3460 "vmla.f32 q14, q4, d3[0] \n"
3461 "vmla.f32 q15, q4, d3[1] \n"
3462
3463 "vmla.f32 q8, q5, d4[0] \n"
3464 "vmla.f32 q9, q5, d4[1] \n"
3465 "vmla.f32 q10, q5, d5[0] \n"
3466 "vmla.f32 q11, q5, d5[1] \n"
3467 "vmla.f32 q12, q5, d6[0] \n"
3468 "vmla.f32 q13, q5, d6[1] \n"
3469 "vmla.f32 q14, q5, d7[0] \n"
3470 "vmla.f32 q15, q5, d7[1] \n"
3471
3472 "pld [%2, #512] \n"
3473 "vldm %2!, {d0-d7} \n"
3474
3475 "vmla.f32 q8, q6, d0[0] \n"
3476 "vmla.f32 q9, q6, d0[1] \n"
3477 "vmla.f32 q10, q6, d1[0] \n"
3478 "vmla.f32 q11, q6, d1[1] \n"
3479 "vmla.f32 q12, q6, d2[0] \n"
3480 "vmla.f32 q13, q6, d2[1] \n"
3481 "vmla.f32 q14, q6, d3[0] \n"
3482 "vmla.f32 q15, q6, d3[1] \n"
3483
3484 "subs %0, %0, #1 \n"
3485
3486 "vmla.f32 q8, q7, d4[0] \n"
3487 "vmla.f32 q9, q7, d4[1] \n"
3488 "vmla.f32 q10, q7, d5[0] \n"
3489 "vmla.f32 q11, q7, d5[1] \n"
3490 "vmla.f32 q12, q7, d6[0] \n"
3491 "vmla.f32 q13, q7, d6[1] \n"
3492 "vmla.f32 q14, q7, d7[0] \n"
3493 "vmla.f32 q15, q7, d7[1] \n"
3494
3495 "bne 0b \n"
3496
3497 "vstm %1!, {d16-d23} \n"
3498 "vstm %1!, {d24-d31} \n"
3499
3500 : "=r"(nn), // %0
3501 "=r"(output0_tm), // %1
3502 "=r"(r0), // %2
3503 "=r"(k0) // %3
3504 : "0"(nn),
3505 "1"(output0_tm),
3506 "2"(r0),
3507 "3"(k0)
3508 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
3509 #endif
3510 }
3511 for (; i + 3 < tiles; i += 4)
3512 {
3513 #if __aarch64__
3514 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
3515 #else
3516 const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
3517 #endif
3518
3519 const float* k0 = kernel0_tm.row(r);
3520
3521 int nn = inch; // inch always > 0
3522
3523 #if __aarch64__
3524 asm volatile(
3525 "eor v16.16b, v16.16b, v16.16b \n"
3526 "eor v17.16b, v17.16b, v17.16b \n"
3527 "eor v18.16b, v18.16b, v18.16b \n"
3528 "eor v19.16b, v19.16b, v19.16b \n"
3529
3530 "0: \n"
3531
3532 "prfm pldl1keep, [%2, #512] \n"
3533 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r0 r1 r2 r3
3534
3535 "prfm pldl1keep, [%3, #512] \n"
3536 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3537
3538 "fmla v16.4s, v8.4s, v0.s[0] \n"
3539 "fmla v17.4s, v8.4s, v1.s[0] \n"
3540 "fmla v18.4s, v8.4s, v2.s[0] \n"
3541 "fmla v19.4s, v8.4s, v3.s[0] \n"
3542
3543 "fmla v16.4s, v9.4s, v0.s[1] \n"
3544 "fmla v17.4s, v9.4s, v1.s[1] \n"
3545 "fmla v18.4s, v9.4s, v2.s[1] \n"
3546 "fmla v19.4s, v9.4s, v3.s[1] \n"
3547
3548 "fmla v16.4s, v10.4s, v0.s[2] \n"
3549 "fmla v17.4s, v10.4s, v1.s[2] \n"
3550 "fmla v18.4s, v10.4s, v2.s[2] \n"
3551 "fmla v19.4s, v10.4s, v3.s[2] \n"
3552
3553 "subs %w0, %w0, #1 \n"
3554
3555 "fmla v16.4s, v11.4s, v0.s[3] \n"
3556 "fmla v17.4s, v11.4s, v1.s[3] \n"
3557 "fmla v18.4s, v11.4s, v2.s[3] \n"
3558 "fmla v19.4s, v11.4s, v3.s[3] \n"
3559
3560 "bne 0b \n"
3561
3562 "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
3563
3564 : "=r"(nn), // %0
3565 "=r"(output0_tm), // %1
3566 "=r"(r0), // %2
3567 "=r"(k0) // %3
3568 : "0"(nn),
3569 "1"(output0_tm),
3570 "2"(r0),
3571 "3"(k0)
3572 : "cc", "memory", "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
3573 #else
3574 asm volatile(
3575 "veor q8, q8 \n"
3576 "veor q9, q9 \n"
3577 "veor q10, q10 \n"
3578 "veor q11, q11 \n"
3579
3580 "0: \n"
3581
3582 "pld [%2, #512] \n"
3583 "vldm %2!, {d0-d7} \n"
3584
3585 "pld [%3, #512] \n"
3586 "vldm %3!, {d8-d15} \n"
3587
3588 "vmla.f32 q8, q4, d0[0] \n"
3589 "vmla.f32 q9, q4, d2[0] \n"
3590 "vmla.f32 q10, q4, d4[0] \n"
3591 "vmla.f32 q11, q4, d6[0] \n"
3592
3593 "vmla.f32 q8, q5, d0[1] \n"
3594 "vmla.f32 q9, q5, d2[1] \n"
3595 "vmla.f32 q10, q5, d4[1] \n"
3596 "vmla.f32 q11, q5, d6[1] \n"
3597
3598 "vmla.f32 q8, q6, d1[0] \n"
3599 "vmla.f32 q9, q6, d3[0] \n"
3600 "vmla.f32 q10, q6, d5[0] \n"
3601 "vmla.f32 q11, q6, d7[0] \n"
3602
3603 "subs %0, %0, #1 \n"
3604
3605 "vmla.f32 q8, q7, d1[1] \n"
3606 "vmla.f32 q9, q7, d3[1] \n"
3607 "vmla.f32 q10, q7, d5[1] \n"
3608 "vmla.f32 q11, q7, d7[1] \n"
3609
3610 "bne 0b \n"
3611
3612 "vstm %1!, {d16-d23} \n"
3613
3614 : "=r"(nn), // %0
3615 "=r"(output0_tm), // %1
3616 "=r"(r0), // %2
3617 "=r"(k0) // %3
3618 : "0"(nn),
3619 "1"(output0_tm),
3620 "2"(r0),
3621 "3"(k0)
3622 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
3623 #endif
3624 }
3625 for (; i + 1 < tiles; i += 2)
3626 {
3627 #if __aarch64__
3628 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2);
3629 #else
3630 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2);
3631 #endif
3632
3633 const float* k0 = kernel0_tm.row(r);
3634
3635 int nn = inch; // inch always > 0
3636
3637 #if __aarch64__
3638 asm volatile(
3639 "eor v16.16b, v16.16b, v16.16b \n"
3640 "eor v17.16b, v17.16b, v17.16b \n"
3641
3642 "0: \n"
3643
3644 "prfm pldl1keep, [%2, #256] \n"
3645 "ld1 {v0.4s, v1.4s}, [%2], #32 \n" // r0 r1
3646
3647 "prfm pldl1keep, [%3, #512] \n"
3648 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3649
3650 "fmla v16.4s, v8.4s, v0.s[0] \n"
3651 "fmla v17.4s, v8.4s, v1.s[0] \n"
3652
3653 "fmla v16.4s, v9.4s, v0.s[1] \n"
3654 "fmla v17.4s, v9.4s, v1.s[1] \n"
3655
3656 "fmla v16.4s, v10.4s, v0.s[2] \n"
3657 "fmla v17.4s, v10.4s, v1.s[2] \n"
3658
3659 "subs %w0, %w0, #1 \n"
3660
3661 "fmla v16.4s, v11.4s, v0.s[3] \n"
3662 "fmla v17.4s, v11.4s, v1.s[3] \n"
3663
3664 "bne 0b \n"
3665
3666 "st1 {v16.4s, v17.4s}, [%1], #32 \n"
3667
3668 : "=r"(nn), // %0
3669 "=r"(output0_tm), // %1
3670 "=r"(r0), // %2
3671 "=r"(k0) // %3
3672 : "0"(nn),
3673 "1"(output0_tm),
3674 "2"(r0),
3675 "3"(k0)
3676 : "cc", "memory", "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17");
3677 #else
3678 asm volatile(
3679 "veor q8, q8 \n"
3680 "veor q9, q9 \n"
3681
3682 "0: \n"
3683
3684 "pld [%2, #256] \n"
3685 "vld1.f32 {d0-d3}, [%2 :128]! \n"
3686
3687 "pld [%3, #512] \n"
3688 "vldm %3!, {d8-d15} \n"
3689
3690 "vmla.f32 q8, q4, d0[0] \n"
3691 "vmla.f32 q9, q4, d2[0] \n"
3692
3693 "vmla.f32 q8, q5, d0[1] \n"
3694 "vmla.f32 q9, q5, d2[1] \n"
3695
3696 "vmla.f32 q8, q6, d1[0] \n"
3697 "vmla.f32 q9, q6, d3[0] \n"
3698
3699 "subs %0, %0, #1 \n"
3700
3701 "vmla.f32 q8, q7, d1[1] \n"
3702 "vmla.f32 q9, q7, d3[1] \n"
3703
3704 "bne 0b \n"
3705
3706 "vst1.f32 {d16-d19}, [%1 :128]! \n"
3707
3708 : "=r"(nn), // %0
3709 "=r"(output0_tm), // %1
3710 "=r"(r0), // %2
3711 "=r"(k0) // %3
3712 : "0"(nn),
3713 "1"(output0_tm),
3714 "2"(r0),
3715 "3"(k0)
3716 : "cc", "memory", "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
3717 #endif
3718 }
3719 for (; i < tiles; i++)
3720 {
3721 #if __aarch64__
3722 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + (i % 12 % 4) / 2 + i % 12 % 2);
3723 #else
3724 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + (i % 4) / 2 + i % 2);
3725 #endif
3726
3727 const float* k0 = kernel0_tm.row(r);
3728
3729 int nn = inch; // inch always > 0
3730
3731 #if __aarch64__
3732 asm volatile(
3733 "eor v16.16b, v16.16b, v16.16b \n"
3734
3735 "0: \n"
3736
3737 "prfm pldl1keep, [%2, #128] \n"
3738 "ld1 {v0.4s}, [%2], #16 \n" // r0
3739
3740 "prfm pldl1keep, [%3, #512] \n"
3741 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%3], #64 \n" // w0123
3742
3743 "fmla v16.4s, v8.4s, v0.s[0] \n"
3744 "fmla v16.4s, v9.4s, v0.s[1] \n"
3745
3746 "subs %w0, %w0, #1 \n"
3747
3748 "fmla v16.4s, v10.4s, v0.s[2] \n"
3749 "fmla v16.4s, v11.4s, v0.s[3] \n"
3750
3751 "bne 0b \n"
3752
3753 "st1 {v16.4s}, [%1], #16 \n"
3754
3755 : "=r"(nn), // %0
3756 "=r"(output0_tm), // %1
3757 "=r"(r0), // %2
3758 "=r"(k0) // %3
3759 : "0"(nn),
3760 "1"(output0_tm),
3761 "2"(r0),
3762 "3"(k0)
3763 : "cc", "memory", "v0", "v8", "v9", "v10", "v11", "v16");
3764 #else
3765 asm volatile(
3766 "veor q8, q8 \n"
3767
3768 "0: \n"
3769
3770 "pld [%2, #128] \n"
3771 "vld1.f32 {d0-d1}, [%2 :128]! \n"
3772
3773 "pld [%3, #512] \n"
3774 "vldm %3!, {d8-d15} \n"
3775
3776 "vmla.f32 q8, q4, d0[0] \n"
3777 "vmla.f32 q8, q5, d0[1] \n"
3778
3779 "subs %0, %0, #1 \n"
3780
3781 "vmla.f32 q8, q6, d1[0] \n"
3782 "vmla.f32 q8, q7, d1[1] \n"
3783
3784 "bne 0b \n"
3785
3786 "vst1.f32 {d16-d17}, [%1 :128]! \n"
3787
3788 : "=r"(nn), // %0
3789 "=r"(output0_tm), // %1
3790 "=r"(r0), // %2
3791 "=r"(k0) // %3
3792 : "0"(nn),
3793 "1"(output0_tm),
3794 "2"(r0),
3795 "3"(k0)
3796 : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8");
3797 #endif
3798 }
3799 }
3800 }
3801 }
3802 bottom_blob_tm = Mat();
3803 // END dot
3804
3805 // BEGIN transform output
3806 Mat top_blob_bordered;
3807 if (outw == top_blob.w && outh == top_blob.h)
3808 {
3809 top_blob_bordered = top_blob;
3810 }
3811 else
3812 {
3813 top_blob_bordered.create(outw, outh, outch, elemsize, elempack, opt.workspace_allocator);
3814 }
3815 {
3816 // const float otm[4][6] = {
3817 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f},
3818 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f},
3819 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f},
3820 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f}
3821 // };
3822
3823 // 0 = r00 + (r01 + r02) + (r03 + r04)
3824 // 1 = (r01 - r02) + (r03 - r04) * 2
3825 // 2 = (r01 + r02) + (r03 + r04) * 4
3826 // 3 = r05 + (r01 - r02) + (r03 - r04) * 8
3827
3828 int w_tm = outw / 4 * 6;
3829 int h_tm = outh / 4 * 6;
3830 const int tiles = w_tm / 6 * h_tm / 6;
3831
3832 #pragma omp parallel for num_threads(opt.num_threads)
3833 for (int p = 0; p < outch; p++)
3834 {
3835 const Mat out0_tm = top_blob_tm.channel(p);
3836 Mat out0 = top_blob_bordered.channel(p);
3837
3838 // const float bias0 = bias ? bias[p] : 0.f;
3839 float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
3840
3841 float tmp[4][6][4];
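                 // tmp holds the intermediate of the separable output transform: 4 output rows x 6 columns x pack4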
3842
3843 // tile
3844 for (int i = 0; i < outh / 4; i++)
3845 {
3846 for (int j = 0; j < outw / 4; j++)
3847 {
3848 // top_blob_tm.create(tiles, 36, outch, elemsize, elempack);
3849
3850 const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 6 + j) * 4;
3851 const float* output0_tm_1 = output0_tm_0 + tiles * 4;
3852 const float* output0_tm_2 = output0_tm_0 + tiles * 8;
3853 const float* output0_tm_3 = output0_tm_0 + tiles * 12;
3854 const float* output0_tm_4 = output0_tm_0 + tiles * 16;
3855 const float* output0_tm_5 = output0_tm_0 + tiles * 20;
3856
3857 float* output0 = out0.row(i * 4) + (j * 4) * 4;
3858
3859 // TODO neon optimize
3860 for (int m = 0; m < 6; m++)
3861 {
3862 float32x4_t _out0tm0 = vld1q_f32(output0_tm_0);
3863 float32x4_t _out0tm1 = vld1q_f32(output0_tm_1);
3864 float32x4_t _out0tm2 = vld1q_f32(output0_tm_2);
3865 float32x4_t _out0tm3 = vld1q_f32(output0_tm_3);
3866 float32x4_t _out0tm4 = vld1q_f32(output0_tm_4);
3867 float32x4_t _out0tm5 = vld1q_f32(output0_tm_5);
3868
3869 float32x4_t _tmp02a = vaddq_f32(_out0tm1, _out0tm2);
3870 float32x4_t _tmp13a = vsubq_f32(_out0tm1, _out0tm2);
3871
3872 float32x4_t _tmp02b = vaddq_f32(_out0tm3, _out0tm4);
3873 float32x4_t _tmp13b = vsubq_f32(_out0tm3, _out0tm4);
3874
3875 float32x4_t _tmp0m = vaddq_f32(vaddq_f32(_out0tm0, _tmp02a), _tmp02b);
3876 float32x4_t _tmp1m = vmlaq_n_f32(_tmp13a, _tmp13b, 2.f);
3877 float32x4_t _tmp2m = vmlaq_n_f32(_tmp02a, _tmp02b, 4.f);
3878 float32x4_t _tmp3m = vmlaq_n_f32(vaddq_f32(_out0tm5, _tmp13a), _tmp13b, 8.f);
3879
3880 vst1q_f32(tmp[0][m], _tmp0m);
3881 vst1q_f32(tmp[1][m], _tmp1m);
3882 vst1q_f32(tmp[2][m], _tmp2m);
3883 vst1q_f32(tmp[3][m], _tmp3m);
3884
3885 output0_tm_0 += tiles * 24;
3886 output0_tm_1 += tiles * 24;
3887 output0_tm_2 += tiles * 24;
3888 output0_tm_3 += tiles * 24;
3889 output0_tm_4 += tiles * 24;
3890 output0_tm_5 += tiles * 24;
3891 }
3892
3893 for (int m = 0; m < 4; m++)
3894 {
3895 float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
3896 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
3897 float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
3898 float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
3899 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
3900 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
3901
3902 float32x4_t _tmp02a = vaddq_f32(_tmp01, _tmp02);
3903 float32x4_t _tmp13a = vsubq_f32(_tmp01, _tmp02);
3904
3905 float32x4_t _tmp02b = vaddq_f32(_tmp03, _tmp04);
3906 float32x4_t _tmp13b = vsubq_f32(_tmp03, _tmp04);
3907
3908 float32x4_t _out00 = vaddq_f32(_bias0, vaddq_f32(vaddq_f32(_tmp00, _tmp02a), _tmp02b));
3909 float32x4_t _out01 = vaddq_f32(_bias0, vmlaq_n_f32(_tmp13a, _tmp13b, 2.f));
3910 float32x4_t _out02 = vaddq_f32(_bias0, vmlaq_n_f32(_tmp02a, _tmp02b, 4.f));
3911 float32x4_t _out03 = vaddq_f32(_bias0, vmlaq_n_f32(vaddq_f32(_tmp05, _tmp13a), _tmp13b, 8.f));
3912
3913 vst1q_f32(output0, _out00);
3914 vst1q_f32(output0 + 4, _out01);
3915 vst1q_f32(output0 + 8, _out02);
3916 vst1q_f32(output0 + 12, _out03);
3917
3918 output0 += outw * 4;
3919 }
3920 }
3921 }
3922 }
3923 }
3924 // END transform output
3925
3926     // crop the bordered result back to the requested output size
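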
3927 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
3928 }
3929
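// direct 3x3 stride-2 convolution on 4-element packed data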
3930 static void conv3x3s2_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
3931 {
3932 int w = bottom_blob.w;
3933 int inch = bottom_blob.c;
3934 int outw = top_blob.w;
3935 int outh = top_blob.h;
3936 int outch = top_blob.c;
3937
3938     const int tailstep = (w - 2 * outw + w) * 4; // skip the unread tail of the current row plus one full row (stride 2 in height, pack4)
3939
3940 const float* bias = _bias;
3941
3942 #pragma omp parallel for num_threads(opt.num_threads)
3943 for (int p = 0; p < outch; p++)
3944 {
3945 Mat out0 = top_blob.channel(p);
3946
3947 float32x4_t _bias0 = bias ? vld1q_f32((const float*)bias + p * 4) : vdupq_n_f32(0.f);
3948 out0.fill(_bias0);
3949
3950 for (int q = 0; q < inch; q++)
3951 {
3952 float* outptr0 = out0.row(0);
3953
3954 const Mat img0 = bottom_blob.channel(q);
3955
3956 const float* r0 = img0.row(0);
3957 const float* r1 = img0.row(1);
3958 const float* r2 = img0.row(2);
3959
3960 const float* kptr = (const float*)kernel.channel(p).row(q);
3961
3962 int i = 0;
3963 for (; i < outh; i++)
3964 {
3965 int j = 0;
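                 // 4 output columns per iteration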
3966 for (; j + 3 < outw; j += 4)
3967 {
3968 #if __aarch64__
3969 asm volatile(
3970 "prfm pldl1keep, [%0, #512] \n"
3971 "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%0] \n" // sum0 sum1 sum2 sum3
3972
3973 "prfm pldl1keep, [%1, #512] \n"
3974 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03
3975
3976 "prfm pldl1keep, [%1, #512] \n"
3977 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n" // r04 r05 r06 r07
3978
3979 "prfm pldl1keep, [%4, #512] \n"
3980 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
3981
3982 "fmla v20.4s, v16.4s, v0.s[0] \n"
3983 "fmla v21.4s, v16.4s, v2.s[0] \n"
3984 "fmla v22.4s, v16.4s, v4.s[0] \n"
3985 "fmla v23.4s, v16.4s, v6.s[0] \n"
3986 "fmla v20.4s, v17.4s, v0.s[1] \n"
3987 "fmla v21.4s, v17.4s, v2.s[1] \n"
3988 "fmla v22.4s, v17.4s, v4.s[1] \n"
3989 "fmla v23.4s, v17.4s, v6.s[1] \n"
3990
3991 "prfm pldl1keep, [%4, #512] \n"
3992 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
3993
3994 "fmla v20.4s, v18.4s, v0.s[2] \n"
3995 "fmla v21.4s, v18.4s, v2.s[2] \n"
3996 "fmla v22.4s, v18.4s, v4.s[2] \n"
3997 "fmla v23.4s, v18.4s, v6.s[2] \n"
3998 "fmla v20.4s, v19.4s, v0.s[3] \n"
3999 "fmla v21.4s, v19.4s, v2.s[3] \n"
4000 "fmla v22.4s, v19.4s, v4.s[3] \n"
4001 "fmla v23.4s, v19.4s, v6.s[3] \n"
4002
4003 "prfm pldl1keep, [%1, #128] \n"
4004 "ld1 {v28.4s}, [%1] \n" // r08
4005
4006 "fmla v20.4s, v24.4s, v1.s[0] \n"
4007 "fmla v21.4s, v24.4s, v3.s[0] \n"
4008 "fmla v22.4s, v24.4s, v5.s[0] \n"
4009 "fmla v23.4s, v24.4s, v7.s[0] \n"
4010 "fmla v20.4s, v25.4s, v1.s[1] \n"
4011 "fmla v21.4s, v25.4s, v3.s[1] \n"
4012 "fmla v22.4s, v25.4s, v5.s[1] \n"
4013 "fmla v23.4s, v25.4s, v7.s[1] \n"
4014
4015 "prfm pldl1keep, [%4, #512] \n"
4016 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4017
4018 "fmla v20.4s, v26.4s, v1.s[2] \n"
4019 "fmla v21.4s, v26.4s, v3.s[2] \n"
4020 "fmla v22.4s, v26.4s, v5.s[2] \n"
4021 "fmla v23.4s, v26.4s, v7.s[2] \n"
4022 "fmla v20.4s, v27.4s, v1.s[3] \n"
4023 "fmla v21.4s, v27.4s, v3.s[3] \n"
4024 "fmla v22.4s, v27.4s, v5.s[3] \n"
4025 "fmla v23.4s, v27.4s, v7.s[3] \n"
4026
4027 "prfm pldl1keep, [%2, #512] \n"
4028 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%2], #64 \n" // r10 r11 r12 r13
4029
4030 "fmla v20.4s, v16.4s, v2.s[0] \n"
4031 "fmla v21.4s, v16.4s, v4.s[0] \n"
4032 "fmla v22.4s, v16.4s, v6.s[0] \n"
4033 "fmla v23.4s, v16.4s, v28.s[0] \n"
4034 "fmla v20.4s, v17.4s, v2.s[1] \n"
4035 "fmla v21.4s, v17.4s, v4.s[1] \n"
4036 "fmla v22.4s, v17.4s, v6.s[1] \n"
4037 "fmla v23.4s, v17.4s, v28.s[1] \n"
4038
4039 "prfm pldl1keep, [%4, #512] \n"
4040 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4041
4042 "fmla v20.4s, v18.4s, v2.s[2] \n"
4043 "fmla v21.4s, v18.4s, v4.s[2] \n"
4044 "fmla v22.4s, v18.4s, v6.s[2] \n"
4045 "fmla v23.4s, v18.4s, v28.s[2] \n"
4046 "fmla v20.4s, v19.4s, v2.s[3] \n"
4047 "fmla v21.4s, v19.4s, v4.s[3] \n"
4048 "fmla v22.4s, v19.4s, v6.s[3] \n"
4049 "fmla v23.4s, v19.4s, v28.s[3] \n"
4050
4051 "prfm pldl1keep, [%2, #512] \n"
4052 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n" // r14 r15 r16 r17
4053
4054 "fmla v20.4s, v24.4s, v8.s[0] \n"
4055 "fmla v21.4s, v24.4s, v10.s[0] \n"
4056 "fmla v22.4s, v24.4s, v12.s[0] \n"
4057 "fmla v23.4s, v24.4s, v14.s[0] \n"
4058 "fmla v20.4s, v25.4s, v8.s[1] \n"
4059 "fmla v21.4s, v25.4s, v10.s[1] \n"
4060 "fmla v22.4s, v25.4s, v12.s[1] \n"
4061 "fmla v23.4s, v25.4s, v14.s[1] \n"
4062
4063 "prfm pldl1keep, [%4, #512] \n"
4064 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4065
4066 "fmla v20.4s, v26.4s, v8.s[2] \n"
4067 "fmla v21.4s, v26.4s, v10.s[2] \n"
4068 "fmla v22.4s, v26.4s, v12.s[2] \n"
4069 "fmla v23.4s, v26.4s, v14.s[2] \n"
4070 "fmla v20.4s, v27.4s, v8.s[3] \n"
4071 "fmla v21.4s, v27.4s, v10.s[3] \n"
4072 "fmla v22.4s, v27.4s, v12.s[3] \n"
4073 "fmla v23.4s, v27.4s, v14.s[3] \n"
4074
4075 "prfm pldl1keep, [%2, #128] \n"
4076 "ld1 {v28.4s}, [%2] \n" // r18
4077
4078 "fmla v20.4s, v16.4s, v9.s[0] \n"
4079 "fmla v21.4s, v16.4s, v11.s[0] \n"
4080 "fmla v22.4s, v16.4s, v13.s[0] \n"
4081 "fmla v23.4s, v16.4s, v15.s[0] \n"
4082 "fmla v20.4s, v17.4s, v9.s[1] \n"
4083 "fmla v21.4s, v17.4s, v11.s[1] \n"
4084 "fmla v22.4s, v17.4s, v13.s[1] \n"
4085 "fmla v23.4s, v17.4s, v15.s[1] \n"
4086
4087 "prfm pldl1keep, [%4, #512] \n"
4088 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4089
4090 "fmla v20.4s, v18.4s, v9.s[2] \n"
4091 "fmla v21.4s, v18.4s, v11.s[2] \n"
4092 "fmla v22.4s, v18.4s, v13.s[2] \n"
4093 "fmla v23.4s, v18.4s, v15.s[2] \n"
4094 "fmla v20.4s, v19.4s, v9.s[3] \n"
4095 "fmla v21.4s, v19.4s, v11.s[3] \n"
4096 "fmla v22.4s, v19.4s, v13.s[3] \n"
4097 "fmla v23.4s, v19.4s, v15.s[3] \n"
4098
4099 "prfm pldl1keep, [%3, #512] \n"
4100 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23
4101
4102 "fmla v20.4s, v24.4s, v10.s[0] \n"
4103 "fmla v21.4s, v24.4s, v12.s[0] \n"
4104 "fmla v22.4s, v24.4s, v14.s[0] \n"
4105 "fmla v23.4s, v24.4s, v28.s[0] \n"
4106 "fmla v20.4s, v25.4s, v10.s[1] \n"
4107 "fmla v21.4s, v25.4s, v12.s[1] \n"
4108 "fmla v22.4s, v25.4s, v14.s[1] \n"
4109 "fmla v23.4s, v25.4s, v28.s[1] \n"
4110
4111 "prfm pldl1keep, [%4, #512] \n"
4112 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4113
4114 "fmla v20.4s, v26.4s, v10.s[2] \n"
4115 "fmla v21.4s, v26.4s, v12.s[2] \n"
4116 "fmla v22.4s, v26.4s, v14.s[2] \n"
4117 "fmla v23.4s, v26.4s, v28.s[2] \n"
4118 "fmla v20.4s, v27.4s, v10.s[3] \n"
4119 "fmla v21.4s, v27.4s, v12.s[3] \n"
4120 "fmla v22.4s, v27.4s, v14.s[3] \n"
4121 "fmla v23.4s, v27.4s, v28.s[3] \n"
4122
4123 "prfm pldl1keep, [%3, #512] \n"
4124 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n" // r24 r25 r26 r27
4125
4126 "fmla v20.4s, v16.4s, v0.s[0] \n"
4127 "fmla v21.4s, v16.4s, v2.s[0] \n"
4128 "fmla v22.4s, v16.4s, v4.s[0] \n"
4129 "fmla v23.4s, v16.4s, v6.s[0] \n"
4130 "fmla v20.4s, v17.4s, v0.s[1] \n"
4131 "fmla v21.4s, v17.4s, v2.s[1] \n"
4132 "fmla v22.4s, v17.4s, v4.s[1] \n"
4133 "fmla v23.4s, v17.4s, v6.s[1] \n"
4134
4135 "prfm pldl1keep, [%4, #512] \n"
4136 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4137
4138 "fmla v20.4s, v18.4s, v0.s[2] \n"
4139 "fmla v21.4s, v18.4s, v2.s[2] \n"
4140 "fmla v22.4s, v18.4s, v4.s[2] \n"
4141 "fmla v23.4s, v18.4s, v6.s[2] \n"
4142 "fmla v20.4s, v19.4s, v0.s[3] \n"
4143 "fmla v21.4s, v19.4s, v2.s[3] \n"
4144 "fmla v22.4s, v19.4s, v4.s[3] \n"
4145 "fmla v23.4s, v19.4s, v6.s[3] \n"
4146
4147 "prfm pldl1keep, [%3, #128] \n"
4148 "ld1 {v28.4s}, [%3] \n" // r28
4149
4150 "fmla v20.4s, v24.4s, v1.s[0] \n"
4151 "fmla v21.4s, v24.4s, v3.s[0] \n"
4152 "fmla v22.4s, v24.4s, v5.s[0] \n"
4153 "fmla v23.4s, v24.4s, v7.s[0] \n"
4154 "fmla v20.4s, v25.4s, v1.s[1] \n"
4155 "fmla v21.4s, v25.4s, v3.s[1] \n"
4156 "fmla v22.4s, v25.4s, v5.s[1] \n"
4157 "fmla v23.4s, v25.4s, v7.s[1] \n"
4158
4159 // "prfm pldl1keep, [%4, #512] \n"
4160 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4161
4162 "fmla v20.4s, v26.4s, v1.s[2] \n"
4163 "fmla v21.4s, v26.4s, v3.s[2] \n"
4164 "fmla v22.4s, v26.4s, v5.s[2] \n"
4165 "fmla v23.4s, v26.4s, v7.s[2] \n"
4166 "fmla v20.4s, v27.4s, v1.s[3] \n"
4167 "fmla v21.4s, v27.4s, v3.s[3] \n"
4168 "fmla v22.4s, v27.4s, v5.s[3] \n"
4169 "fmla v23.4s, v27.4s, v7.s[3] \n"
4170
4171 "fmla v20.4s, v16.4s, v2.s[0] \n"
4172 "fmla v21.4s, v16.4s, v4.s[0] \n"
4173 "fmla v22.4s, v16.4s, v6.s[0] \n"
4174 "fmla v23.4s, v16.4s, v28.s[0] \n"
4175 "fmla v20.4s, v17.4s, v2.s[1] \n"
4176 "fmla v21.4s, v17.4s, v4.s[1] \n"
4177 "fmla v22.4s, v17.4s, v6.s[1] \n"
4178 "fmla v23.4s, v17.4s, v28.s[1] \n"
4179 "fmla v20.4s, v18.4s, v2.s[2] \n"
4180 "fmla v21.4s, v18.4s, v4.s[2] \n"
4181 "fmla v22.4s, v18.4s, v6.s[2] \n"
4182 "fmla v23.4s, v18.4s, v28.s[2] \n"
4183 "fmla v20.4s, v19.4s, v2.s[3] \n"
4184 "fmla v21.4s, v19.4s, v4.s[3] \n"
4185 "fmla v22.4s, v19.4s, v6.s[3] \n"
4186 "fmla v23.4s, v19.4s, v28.s[3] \n"
4187
4188 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4189
4190 "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
4191
4192 : "=r"(outptr0), // %0
4193 "=r"(r0), // %1
4194 "=r"(r1), // %2
4195 "=r"(r2), // %3
4196 "=r"(kptr) // %4
4197 : "0"(outptr0),
4198 "1"(r0),
4199 "2"(r1),
4200 "3"(r2),
4201 "4"(kptr)
4202 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
4203 #else // __aarch64__
4204 asm volatile(
4205 "pld [%0, #512] \n"
4206 "vldm %0, {d24-d31} \n" // sum0 sum1 sum2 sum3
4207
4208 "pld [%1, #512] \n"
4209 "vldm %1!, {d0-d7} \n" // r00 r01 r02 r03
4210
4211 "pld [%1, #512] \n"
4212 "vldm %1!, {d8-d15} \n" // r04 r05 r06 r07
4213
4214 "pld [%4, #512] \n"
4215 "vldm %4!, {d16-d23} \n"
4216
4217 "vmla.f32 q12, q8, d0[0] \n"
4218 "vmla.f32 q13, q8, d4[0] \n"
4219 "vmla.f32 q14, q8, d8[0] \n"
4220 "vmla.f32 q15, q8, d12[0] \n"
4221 "vmla.f32 q12, q9, d0[1] \n"
4222 "vmla.f32 q13, q9, d4[1] \n"
4223 "vmla.f32 q14, q9, d8[1] \n"
4224 "vmla.f32 q15, q9, d12[1] \n"
4225 "vmla.f32 q12, q10, d1[0] \n"
4226 "vmla.f32 q13, q10, d5[0] \n"
4227 "vmla.f32 q14, q10, d9[0] \n"
4228 "vmla.f32 q15, q10, d13[0] \n"
4229 "vmla.f32 q12, q11, d1[1] \n"
4230 "vmla.f32 q13, q11, d5[1] \n"
4231 "vmla.f32 q14, q11, d9[1] \n"
4232 "vmla.f32 q15, q11, d13[1] \n"
4233
4234 "pld [%4, #512] \n"
4235 "vldm %4!, {d16-d23} \n"
4236
4237 "pld [%1, #128] \n"
4238 "vld1.f32 {d0-d1}, [%1 :128] \n" // r08
4239
4240 "vmla.f32 q12, q8, d2[0] \n"
4241 "vmla.f32 q13, q8, d6[0] \n"
4242 "vmla.f32 q14, q8, d10[0] \n"
4243 "vmla.f32 q15, q8, d14[0] \n"
4244 "vmla.f32 q12, q9, d2[1] \n"
4245 "vmla.f32 q13, q9, d6[1] \n"
4246 "vmla.f32 q14, q9, d10[1] \n"
4247 "vmla.f32 q15, q9, d14[1] \n"
4248 "vmla.f32 q12, q10, d3[0] \n"
4249 "vmla.f32 q13, q10, d7[0] \n"
4250 "vmla.f32 q14, q10, d11[0] \n"
4251 "vmla.f32 q15, q10, d15[0] \n"
4252 "vmla.f32 q12, q11, d3[1] \n"
4253 "vmla.f32 q13, q11, d7[1] \n"
4254 "vmla.f32 q14, q11, d11[1] \n"
4255 "vmla.f32 q15, q11, d15[1] \n"
4256
4257 "pld [%4, #512] \n"
4258 "vldm %4!, {d16-d23} \n"
4259
4260 "vmla.f32 q12, q8, d4[0] \n"
4261 "vmla.f32 q13, q8, d8[0] \n"
4262 "vmla.f32 q14, q8, d12[0] \n"
4263 "vmla.f32 q15, q8, d0[0] \n"
4264 "vmla.f32 q12, q9, d4[1] \n"
4265 "vmla.f32 q13, q9, d8[1] \n"
4266 "vmla.f32 q14, q9, d12[1] \n"
4267 "vmla.f32 q15, q9, d0[1] \n"
4268 "vmla.f32 q12, q10, d5[0] \n"
4269 "vmla.f32 q13, q10, d9[0] \n"
4270 "vmla.f32 q14, q10, d13[0] \n"
4271 "vmla.f32 q15, q10, d1[0] \n"
4272 "vmla.f32 q12, q11, d5[1] \n"
4273 "vmla.f32 q13, q11, d9[1] \n"
4274 "vmla.f32 q14, q11, d13[1] \n"
4275 "vmla.f32 q15, q11, d1[1] \n"
4276
4277 "pld [%2, #512] \n"
4278 "vldm %2!, {d8-d15} \n" // r10 r11 r12 r13
4279
4280 "pld [%2, #512] \n"
4281 "vldm %2!, {d0-d7} \n" // r14 r15 r16 r17
4282
4283 "pld [%4, #512] \n"
4284 "vldm %4!, {d16-d23} \n"
4285
4286 "vmla.f32 q12, q8, d8[0] \n"
4287 "vmla.f32 q13, q8, d12[0] \n"
4288 "vmla.f32 q14, q8, d0[0] \n"
4289 "vmla.f32 q15, q8, d4[0] \n"
4290 "vmla.f32 q12, q9, d8[1] \n"
4291 "vmla.f32 q13, q9, d12[1] \n"
4292 "vmla.f32 q14, q9, d0[1] \n"
4293 "vmla.f32 q15, q9, d4[1] \n"
4294 "vmla.f32 q12, q10, d9[0] \n"
4295 "vmla.f32 q13, q10, d13[0] \n"
4296 "vmla.f32 q14, q10, d1[0] \n"
4297 "vmla.f32 q15, q10, d5[0] \n"
4298 "vmla.f32 q12, q11, d9[1] \n"
4299 "vmla.f32 q13, q11, d13[1] \n"
4300 "vmla.f32 q14, q11, d1[1] \n"
4301 "vmla.f32 q15, q11, d5[1] \n"
4302
4303 "pld [%4, #512] \n"
4304 "vldm %4!, {d16-d23} \n"
4305
4306 "pld [%2, #128] \n"
4307 "vld1.f32 {d8-d9}, [%2 :128] \n" // r18
4308
4309 "vmla.f32 q12, q8, d10[0] \n"
4310 "vmla.f32 q13, q8, d14[0] \n"
4311 "vmla.f32 q14, q8, d2[0] \n"
4312 "vmla.f32 q15, q8, d6[0] \n"
4313 "vmla.f32 q12, q9, d10[1] \n"
4314 "vmla.f32 q13, q9, d14[1] \n"
4315 "vmla.f32 q14, q9, d2[1] \n"
4316 "vmla.f32 q15, q9, d6[1] \n"
4317 "vmla.f32 q12, q10, d11[0] \n"
4318 "vmla.f32 q13, q10, d15[0] \n"
4319 "vmla.f32 q14, q10, d3[0] \n"
4320 "vmla.f32 q15, q10, d7[0] \n"
4321 "vmla.f32 q12, q11, d11[1] \n"
4322 "vmla.f32 q13, q11, d15[1] \n"
4323 "vmla.f32 q14, q11, d3[1] \n"
4324 "vmla.f32 q15, q11, d7[1] \n"
4325
4326 "pld [%4, #512] \n"
4327 "vldm %4!, {d16-d23} \n"
4328
4329 "vmla.f32 q12, q8, d12[0] \n"
4330 "vmla.f32 q13, q8, d0[0] \n"
4331 "vmla.f32 q14, q8, d4[0] \n"
4332 "vmla.f32 q15, q8, d8[0] \n"
4333 "vmla.f32 q12, q9, d12[1] \n"
4334 "vmla.f32 q13, q9, d0[1] \n"
4335 "vmla.f32 q14, q9, d4[1] \n"
4336 "vmla.f32 q15, q9, d8[1] \n"
4337 "vmla.f32 q12, q10, d13[0] \n"
4338 "vmla.f32 q13, q10, d1[0] \n"
4339 "vmla.f32 q14, q10, d5[0] \n"
4340 "vmla.f32 q15, q10, d9[0] \n"
4341 "vmla.f32 q12, q11, d13[1] \n"
4342 "vmla.f32 q13, q11, d1[1] \n"
4343 "vmla.f32 q14, q11, d5[1] \n"
4344 "vmla.f32 q15, q11, d9[1] \n"
4345
4346 "pld [%3, #512] \n"
4347 "vldm %3!, {d0-d7} \n" // r20 r21 r22 r23
4348
4349 "pld [%3, #512] \n"
4350 "vldm %3!, {d8-d15} \n" // r24 r25 r26 r27
4351
4352 "pld [%4, #512] \n"
4353 "vldm %4!, {d16-d23} \n"
4354
4355 "vmla.f32 q12, q8, d0[0] \n"
4356 "vmla.f32 q13, q8, d4[0] \n"
4357 "vmla.f32 q14, q8, d8[0] \n"
4358 "vmla.f32 q15, q8, d12[0] \n"
4359 "vmla.f32 q12, q9, d0[1] \n"
4360 "vmla.f32 q13, q9, d4[1] \n"
4361 "vmla.f32 q14, q9, d8[1] \n"
4362 "vmla.f32 q15, q9, d12[1] \n"
4363 "vmla.f32 q12, q10, d1[0] \n"
4364 "vmla.f32 q13, q10, d5[0] \n"
4365 "vmla.f32 q14, q10, d9[0] \n"
4366 "vmla.f32 q15, q10, d13[0] \n"
4367 "vmla.f32 q12, q11, d1[1] \n"
4368 "vmla.f32 q13, q11, d5[1] \n"
4369 "vmla.f32 q14, q11, d9[1] \n"
4370 "vmla.f32 q15, q11, d13[1] \n"
4371
4372 "pld [%4, #512] \n"
4373 "vldm %4!, {d16-d23} \n"
4374
4375 "pld [%3, #128] \n"
4376 "vld1.f32 {d0-d1}, [%3 :128] \n" // r28
4377
4378 "vmla.f32 q12, q8, d2[0] \n"
4379 "vmla.f32 q13, q8, d6[0] \n"
4380 "vmla.f32 q14, q8, d10[0] \n"
4381 "vmla.f32 q15, q8, d14[0] \n"
4382 "vmla.f32 q12, q9, d2[1] \n"
4383 "vmla.f32 q13, q9, d6[1] \n"
4384 "vmla.f32 q14, q9, d10[1] \n"
4385 "vmla.f32 q15, q9, d14[1] \n"
4386 "vmla.f32 q12, q10, d3[0] \n"
4387 "vmla.f32 q13, q10, d7[0] \n"
4388 "vmla.f32 q14, q10, d11[0] \n"
4389 "vmla.f32 q15, q10, d15[0] \n"
4390 "vmla.f32 q12, q11, d3[1] \n"
4391 "vmla.f32 q13, q11, d7[1] \n"
4392 "vmla.f32 q14, q11, d11[1] \n"
4393 "vmla.f32 q15, q11, d15[1] \n"
4394
4395 // "pld [%4, #512] \n"
4396 "vldm %4, {d16-d23} \n"
4397
4398 "vmla.f32 q12, q8, d4[0] \n"
4399 "vmla.f32 q13, q8, d8[0] \n"
4400 "vmla.f32 q14, q8, d12[0] \n"
4401 "vmla.f32 q15, q8, d0[0] \n"
4402 "vmla.f32 q12, q9, d4[1] \n"
4403 "vmla.f32 q13, q9, d8[1] \n"
4404 "vmla.f32 q14, q9, d12[1] \n"
4405 "vmla.f32 q15, q9, d0[1] \n"
4406 "vmla.f32 q12, q10, d5[0] \n"
4407 "vmla.f32 q13, q10, d9[0] \n"
4408 "vmla.f32 q14, q10, d13[0] \n"
4409 "vmla.f32 q15, q10, d1[0] \n"
4410 "vmla.f32 q12, q11, d5[1] \n"
4411 "vmla.f32 q13, q11, d9[1] \n"
4412 "vmla.f32 q14, q11, d13[1] \n"
4413 "vmla.f32 q15, q11, d1[1] \n"
4414
4415 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4416
4417 "vstm %0!, {d24-d31} \n"
4418
4419 : "=r"(outptr0), // %0
4420 "=r"(r0), // %1
4421 "=r"(r1), // %2
4422 "=r"(r2), // %3
4423 "=r"(kptr) // %4
4424 : "0"(outptr0),
4425 "1"(r0),
4426 "2"(r1),
4427 "3"(r2),
4428 "4"(kptr)
4429 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4430 #endif // __aarch64__
4431 }
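                 // 2 output columns per iteration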
4432 for (; j + 1 < outw; j += 2)
4433 {
4434 #if __aarch64__
4435 asm volatile(
4436 "prfm pldl1keep, [%0, #256] \n"
4437 "ld1 {v20.4s, v21.4s}, [%0] \n" // sum0 sum1
4438
4439 "prfm pldl1keep, [%1, #512] \n"
4440 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n" // r00 r01 r02 r03
4441
4442 "prfm pldl1keep, [%4, #512] \n"
4443 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4444
4445 "fmul v22.4s, v16.4s, v0.s[0] \n"
4446 "fmul v23.4s, v16.4s, v2.s[0] \n"
4447 "fmla v20.4s, v17.4s, v0.s[1] \n"
4448 "fmla v21.4s, v17.4s, v2.s[1] \n"
4449
4450 "prfm pldl1keep, [%4, #512] \n"
4451 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4452
4453 "fmla v22.4s, v18.4s, v0.s[2] \n"
4454 "fmla v23.4s, v18.4s, v2.s[2] \n"
4455 "fmla v20.4s, v19.4s, v0.s[3] \n"
4456 "fmla v21.4s, v19.4s, v2.s[3] \n"
4457
4458 "prfm pldl1keep, [%1, #128] \n"
4459 "ld1 {v4.4s}, [%1] \n" // r04
4460
4461 "fmla v22.4s, v24.4s, v1.s[0] \n"
4462 "fmla v23.4s, v24.4s, v3.s[0] \n"
4463 "fmla v20.4s, v25.4s, v1.s[1] \n"
4464 "fmla v21.4s, v25.4s, v3.s[1] \n"
4465
4466 "prfm pldl1keep, [%4, #512] \n"
4467 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4468
4469 "fmla v22.4s, v26.4s, v1.s[2] \n"
4470 "fmla v23.4s, v26.4s, v3.s[2] \n"
4471 "fmla v20.4s, v27.4s, v1.s[3] \n"
4472 "fmla v21.4s, v27.4s, v3.s[3] \n"
4473
4474 "fmla v22.4s, v16.4s, v2.s[0] \n"
4475 "fmla v23.4s, v16.4s, v4.s[0] \n"
4476 "fmla v20.4s, v17.4s, v2.s[1] \n"
4477 "fmla v21.4s, v17.4s, v4.s[1] \n"
4478
4479 "prfm pldl1keep, [%4, #512] \n"
4480 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4481
4482 "fmla v22.4s, v18.4s, v2.s[2] \n"
4483 "fmla v23.4s, v18.4s, v4.s[2] \n"
4484 "fmla v20.4s, v19.4s, v2.s[3] \n"
4485 "fmla v21.4s, v19.4s, v4.s[3] \n"
4486
4487 "prfm pldl1keep, [%2, #512] \n"
4488 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n" // r10 r11 r12 r13
4489
4490 "fmla v22.4s, v24.4s, v0.s[0] \n"
4491 "fmla v23.4s, v24.4s, v2.s[0] \n"
4492 "fmla v20.4s, v25.4s, v0.s[1] \n"
4493 "fmla v21.4s, v25.4s, v2.s[1] \n"
4494
4495 "prfm pldl1keep, [%4, #512] \n"
4496 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4497
4498 "fmla v22.4s, v26.4s, v0.s[2] \n"
4499 "fmla v23.4s, v26.4s, v2.s[2] \n"
4500 "fmla v20.4s, v27.4s, v0.s[3] \n"
4501 "fmla v21.4s, v27.4s, v2.s[3] \n"
4502
4503 "prfm pldl1keep, [%2, #128] \n"
4504 "ld1 {v4.4s}, [%2] \n" // r14
4505
4506 "fmla v22.4s, v16.4s, v1.s[0] \n"
4507 "fmla v23.4s, v16.4s, v3.s[0] \n"
4508 "fmla v20.4s, v17.4s, v1.s[1] \n"
4509 "fmla v21.4s, v17.4s, v3.s[1] \n"
4510
4511 "prfm pldl1keep, [%4, #512] \n"
4512 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4513
4514 "fmla v22.4s, v18.4s, v1.s[2] \n"
4515 "fmla v23.4s, v18.4s, v3.s[2] \n"
4516 "fmla v20.4s, v19.4s, v1.s[3] \n"
4517 "fmla v21.4s, v19.4s, v3.s[3] \n"
4518
4519 "fmla v22.4s, v24.4s, v2.s[0] \n"
4520 "fmla v23.4s, v24.4s, v4.s[0] \n"
4521 "fmla v20.4s, v25.4s, v2.s[1] \n"
4522 "fmla v21.4s, v25.4s, v4.s[1] \n"
4523
4524 "prfm pldl1keep, [%4, #512] \n"
4525 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4526
4527 "fmla v22.4s, v26.4s, v2.s[2] \n"
4528 "fmla v23.4s, v26.4s, v4.s[2] \n"
4529 "fmla v20.4s, v27.4s, v2.s[3] \n"
4530 "fmla v21.4s, v27.4s, v4.s[3] \n"
4531
4532 "prfm pldl1keep, [%3, #512] \n"
4533 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n" // r20 r21 r22 r23
4534
4535 "fmla v22.4s, v16.4s, v0.s[0] \n"
4536 "fmla v23.4s, v16.4s, v2.s[0] \n"
4537 "fmla v20.4s, v17.4s, v0.s[1] \n"
4538 "fmla v21.4s, v17.4s, v2.s[1] \n"
4539
4540 "prfm pldl1keep, [%4, #512] \n"
4541 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4542
4543 "fmla v22.4s, v18.4s, v0.s[2] \n"
4544 "fmla v23.4s, v18.4s, v2.s[2] \n"
4545 "fmla v20.4s, v19.4s, v0.s[3] \n"
4546 "fmla v21.4s, v19.4s, v2.s[3] \n"
4547
4548 "prfm pldl1keep, [%3, #128] \n"
4549 "ld1 {v4.4s}, [%3] \n" // r24
4550
4551 "fmla v22.4s, v24.4s, v1.s[0] \n"
4552 "fmla v23.4s, v24.4s, v3.s[0] \n"
4553 "fmla v20.4s, v25.4s, v1.s[1] \n"
4554 "fmla v21.4s, v25.4s, v3.s[1] \n"
4555
4556 // "prfm pldl1keep, [%4, #512] \n"
4557 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4558
4559 "fmla v22.4s, v26.4s, v1.s[2] \n"
4560 "fmla v23.4s, v26.4s, v3.s[2] \n"
4561 "fmla v20.4s, v27.4s, v1.s[3] \n"
4562 "fmla v21.4s, v27.4s, v3.s[3] \n"
4563
4564 "fmla v22.4s, v16.4s, v2.s[0] \n"
4565 "fmla v23.4s, v16.4s, v4.s[0] \n"
4566 "fmla v20.4s, v17.4s, v2.s[1] \n"
4567 "fmla v21.4s, v17.4s, v4.s[1] \n"
4568 "fmla v22.4s, v18.4s, v2.s[2] \n"
4569 "fmla v23.4s, v18.4s, v4.s[2] \n"
4570 "fmla v20.4s, v19.4s, v2.s[3] \n"
4571 "fmla v21.4s, v19.4s, v4.s[3] \n"
4572
4573 "fadd v20.4s, v20.4s, v22.4s \n"
4574 "fadd v21.4s, v21.4s, v23.4s \n"
4575
4576 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4577
4578 "st1 {v20.4s, v21.4s}, [%0], #32 \n"
4579
4580 : "=r"(outptr0), // %0
4581 "=r"(r0), // %1
4582 "=r"(r1), // %2
4583 "=r"(r2), // %3
4584 "=r"(kptr) // %4
4585 : "0"(outptr0),
4586 "1"(r0),
4587 "2"(r1),
4588 "3"(r2),
4589 "4"(kptr)
4590 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
4591 #else // __aarch64__
4592 asm volatile(
4593 "pld [%0, #256] \n"
4594 "vld1.f32 {d24-d27}, [%0 :128] \n" // sum0 sum1
4595
4596 "pld [%1, #512] \n"
4597 "vldm %1!, {d0-d7} \n" // r00 r01 r02 r03
4598
4599 "pld [%4, #512] \n"
4600 "vldm %4!, {d16-d23} \n"
4601
4602 "vmul.f32 q14, q8, d0[0] \n"
4603 "vmul.f32 q15, q8, d4[0] \n"
4604 "vmla.f32 q12, q9, d0[1] \n"
4605 "vmla.f32 q13, q9, d4[1] \n"
4606 "vmla.f32 q14, q10, d1[0] \n"
4607 "vmla.f32 q15, q10, d5[0] \n"
4608 "vmla.f32 q12, q11, d1[1] \n"
4609 "vmla.f32 q13, q11, d5[1] \n"
4610
4611 "pld [%4, #512] \n"
4612 "vldm %4!, {d16-d23} \n"
4613
4614 "pld [%1, #128] \n"
4615 "vld1.f32 {d8-d9}, [%1 :128] \n" // r04
4616
4617 "vmla.f32 q14, q8, d2[0] \n"
4618 "vmla.f32 q15, q8, d6[0] \n"
4619 "vmla.f32 q12, q9, d2[1] \n"
4620 "vmla.f32 q13, q9, d6[1] \n"
4621 "vmla.f32 q14, q10, d3[0] \n"
4622 "vmla.f32 q15, q10, d7[0] \n"
4623 "vmla.f32 q12, q11, d3[1] \n"
4624 "vmla.f32 q13, q11, d7[1] \n"
4625
4626 "pld [%4, #512] \n"
4627 "vldm %4!, {d16-d23} \n"
4628
4629 "vmla.f32 q14, q8, d4[0] \n"
4630 "vmla.f32 q15, q8, d8[0] \n"
4631 "vmla.f32 q12, q9, d4[1] \n"
4632 "vmla.f32 q13, q9, d8[1] \n"
4633 "vmla.f32 q14, q10, d5[0] \n"
4634 "vmla.f32 q15, q10, d9[0] \n"
4635 "vmla.f32 q12, q11, d5[1] \n"
4636 "vmla.f32 q13, q11, d9[1] \n"
4637
4638 "pld [%2, #512] \n"
4639 "vldm %2!, {d0-d7} \n" // r10 r11 r12 r13
4640
4641 "pld [%4, #512] \n"
4642 "vldm %4!, {d16-d23} \n"
4643
4644 "vmla.f32 q14, q8, d0[0] \n"
4645 "vmla.f32 q15, q8, d4[0] \n"
4646 "vmla.f32 q12, q9, d0[1] \n"
4647 "vmla.f32 q13, q9, d4[1] \n"
4648 "vmla.f32 q14, q10, d1[0] \n"
4649 "vmla.f32 q15, q10, d5[0] \n"
4650 "vmla.f32 q12, q11, d1[1] \n"
4651 "vmla.f32 q13, q11, d5[1] \n"
4652
4653 "pld [%4, #512] \n"
4654 "vldm %4!, {d16-d23} \n"
4655
4656 "pld [%2, #128] \n"
4657 "vld1.f32 {d8-d9}, [%2 :128] \n" // r14
4658
4659 "vmla.f32 q14, q8, d2[0] \n"
4660 "vmla.f32 q15, q8, d6[0] \n"
4661 "vmla.f32 q12, q9, d2[1] \n"
4662 "vmla.f32 q13, q9, d6[1] \n"
4663 "vmla.f32 q14, q10, d3[0] \n"
4664 "vmla.f32 q15, q10, d7[0] \n"
4665 "vmla.f32 q12, q11, d3[1] \n"
4666 "vmla.f32 q13, q11, d7[1] \n"
4667
4668 "pld [%4, #512] \n"
4669 "vldm %4!, {d16-d23} \n"
4670
4671 "vmla.f32 q14, q8, d4[0] \n"
4672 "vmla.f32 q15, q8, d8[0] \n"
4673 "vmla.f32 q12, q9, d4[1] \n"
4674 "vmla.f32 q13, q9, d8[1] \n"
4675 "vmla.f32 q14, q10, d5[0] \n"
4676 "vmla.f32 q15, q10, d9[0] \n"
4677 "vmla.f32 q12, q11, d5[1] \n"
4678 "vmla.f32 q13, q11, d9[1] \n"
4679
4680 "pld [%3, #512] \n"
4681 "vldm %3!, {d0-d7} \n" // r20 r21 r22 r23
4682
4683 "pld [%4, #512] \n"
4684 "vldm %4!, {d16-d23} \n"
4685
4686 "vmla.f32 q14, q8, d0[0] \n"
4687 "vmla.f32 q15, q8, d4[0] \n"
4688 "vmla.f32 q12, q9, d0[1] \n"
4689 "vmla.f32 q13, q9, d4[1] \n"
4690 "vmla.f32 q14, q10, d1[0] \n"
4691 "vmla.f32 q15, q10, d5[0] \n"
4692 "vmla.f32 q12, q11, d1[1] \n"
4693 "vmla.f32 q13, q11, d5[1] \n"
4694
4695 "pld [%4, #512] \n"
4696 "vldm %4!, {d16-d23} \n"
4697
4698 "pld [%3, #128] \n"
4699 "vld1.f32 {d8-d9}, [%3 :128] \n" // r24
4700
4701 "vmla.f32 q14, q8, d2[0] \n"
4702 "vmla.f32 q15, q8, d6[0] \n"
4703 "vmla.f32 q12, q9, d2[1] \n"
4704 "vmla.f32 q13, q9, d6[1] \n"
4705 "vmla.f32 q14, q10, d3[0] \n"
4706 "vmla.f32 q15, q10, d7[0] \n"
4707 "vmla.f32 q12, q11, d3[1] \n"
4708 "vmla.f32 q13, q11, d7[1] \n"
4709
4710 // "pld [%4, #512] \n"
4711 "vldm %4, {d16-d23} \n"
4712
4713 "vmla.f32 q14, q8, d4[0] \n"
4714 "vmla.f32 q15, q8, d8[0] \n"
4715 "vmla.f32 q12, q9, d4[1] \n"
4716 "vmla.f32 q13, q9, d8[1] \n"
4717 "vmla.f32 q14, q10, d5[0] \n"
4718 "vmla.f32 q15, q10, d9[0] \n"
4719 "vmla.f32 q12, q11, d5[1] \n"
4720 "vmla.f32 q13, q11, d9[1] \n"
4721
4722 "vadd.f32 q12, q12, q14 \n"
4723 "vadd.f32 q13, q13, q15 \n"
4724
4725 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4726
4727 "vst1.f32 {d24-d27}, [%0 :128]! \n"
4728
4729 : "=r"(outptr0), // %0
4730 "=r"(r0), // %1
4731 "=r"(r1), // %2
4732 "=r"(r2), // %3
4733 "=r"(kptr) // %4
4734 : "0"(outptr0),
4735 "1"(r0),
4736 "2"(r1),
4737 "3"(r2),
4738 "4"(kptr)
4739 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4740 #endif // __aarch64__
4741 }
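                 // leftover single output column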
4742 for (; j < outw; j++)
4743 {
4744 #if __aarch64__
4745 asm volatile(
4746 "prfm pldl1keep, [%0, #128] \n"
4747 "ld1 {v20.4s}, [%0] \n" // sum0
4748
4749 "prfm pldl1keep, [%1, #384] \n"
4750 "ld1 {v0.4s, v1.4s, v2.4s}, [%1] \n" // r00 r01 r02
4751
4752 "prfm pldl1keep, [%4, #512] \n"
4753 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4754
4755 "fmul v21.4s, v16.4s, v0.s[0] \n"
4756 "fmul v22.4s, v17.4s, v0.s[1] \n"
4757
4758 "prfm pldl1keep, [%4, #512] \n"
4759 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4760
4761 "fmul v23.4s, v18.4s, v0.s[2] \n"
4762 "fmla v20.4s, v19.4s, v0.s[3] \n"
4763
4764 "fmla v21.4s, v24.4s, v1.s[0] \n"
4765 "fmla v22.4s, v25.4s, v1.s[1] \n"
4766
4767 "prfm pldl1keep, [%4, #512] \n"
4768 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4769
4770 "fmla v23.4s, v26.4s, v1.s[2] \n"
4771 "fmla v20.4s, v27.4s, v1.s[3] \n"
4772
4773 "prfm pldl1keep, [%2, #384] \n"
4774 "ld1 {v3.4s, v4.4s, v5.4s}, [%2] \n" // r10 r11 r12
4775
4776 "fmla v21.4s, v16.4s, v2.s[0] \n"
4777 "fmla v22.4s, v17.4s, v2.s[1] \n"
4778
4779 "prfm pldl1keep, [%4, #512] \n"
4780 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4781
4782 "fmla v23.4s, v18.4s, v2.s[2] \n"
4783 "fmla v20.4s, v19.4s, v2.s[3] \n"
4784
4785 "fmla v21.4s, v24.4s, v3.s[0] \n"
4786 "fmla v22.4s, v25.4s, v3.s[1] \n"
4787
4788 "prfm pldl1keep, [%4, #512] \n"
4789 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4790
4791 "fmla v23.4s, v26.4s, v3.s[2] \n"
4792 "fmla v20.4s, v27.4s, v3.s[3] \n"
4793
4794 "fmla v21.4s, v16.4s, v4.s[0] \n"
4795 "fmla v22.4s, v17.4s, v4.s[1] \n"
4796
4797 "prfm pldl1keep, [%4, #512] \n"
4798 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4799
4800 "fmla v23.4s, v18.4s, v4.s[2] \n"
4801 "fmla v20.4s, v19.4s, v4.s[3] \n"
4802
4803 "prfm pldl1keep, [%3, #384] \n"
4804 "ld1 {v0.4s, v1.4s, v2.4s}, [%3] \n" // r20 r21 r22
4805
4806 "fmla v21.4s, v24.4s, v5.s[0] \n"
4807 "fmla v22.4s, v25.4s, v5.s[1] \n"
4808
4809 "prfm pldl1keep, [%4, #512] \n"
4810 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4], #64 \n"
4811
4812 "fmla v23.4s, v26.4s, v5.s[2] \n"
4813 "fmla v20.4s, v27.4s, v5.s[3] \n"
4814
4815 "fmla v21.4s, v16.4s, v0.s[0] \n"
4816 "fmla v22.4s, v17.4s, v0.s[1] \n"
4817
4818 "prfm pldl1keep, [%4, #512] \n"
4819 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%4], #64 \n"
4820
4821 "fmla v23.4s, v18.4s, v0.s[2] \n"
4822 "fmla v20.4s, v19.4s, v0.s[3] \n"
4823
4824 "fmla v21.4s, v24.4s, v1.s[0] \n"
4825 "fmla v22.4s, v25.4s, v1.s[1] \n"
4826
4827 // "prfm pldl1keep, [%4, #512] \n"
4828 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%4] \n"
4829
4830 "fmla v23.4s, v26.4s, v1.s[2] \n"
4831 "fmla v20.4s, v27.4s, v1.s[3] \n"
4832
4833 "fmla v21.4s, v16.4s, v2.s[0] \n"
4834 "fmla v22.4s, v17.4s, v2.s[1] \n"
4835 "fmla v23.4s, v18.4s, v2.s[2] \n"
4836 "fmla v20.4s, v19.4s, v2.s[3] \n"
4837
4838 "add %1, %1, #32 \n"
4839
4840 "fadd v22.4s, v21.4s, v22.4s \n"
4841
4842 "add %2, %2, #32 \n"
4843
4844 "fadd v23.4s, v23.4s, v22.4s \n"
4845
4846 "add %3, %3, #32 \n"
4847
4848 "fadd v20.4s, v20.4s, v23.4s \n"
4849
4850 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4851
4852 "st1 {v20.4s}, [%0], #16 \n"
4853
4854 : "=r"(outptr0), // %0
4855 "=r"(r0), // %1
4856 "=r"(r1), // %2
4857 "=r"(r2), // %3
4858 "=r"(kptr) // %4
4859 : "0"(outptr0),
4860 "1"(r0),
4861 "2"(r1),
4862 "3"(r2),
4863 "4"(kptr)
4864 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
4865 #else // __aarch64__
4866 asm volatile(
4867 "pld [%0, #128] \n"
4868 "vld1.f32 {d24-d25}, [%0 :128] \n" // sum0
4869
4870 "pld [%1, #384] \n"
4871 "vldm %1, {d0-d5} \n" // r00 r01 r02
4872
4873 "pld [%4, #512] \n"
4874 "vldm %4!, {d16-d23} \n"
4875
4876 "vmul.f32 q13, q8, d0[0] \n"
4877 "vmul.f32 q14, q9, d0[1] \n"
4878 "vmul.f32 q15, q10, d1[0] \n"
4879 "vmla.f32 q12, q11, d1[1] \n"
4880
4881 "pld [%4, #512] \n"
4882 "vldm %4!, {d16-d23} \n"
4883
4884 "vmla.f32 q13, q8, d2[0] \n"
4885 "vmla.f32 q14, q9, d2[1] \n"
4886 "vmla.f32 q15, q10, d3[0] \n"
4887 "vmla.f32 q12, q11, d3[1] \n"
4888
4889 "pld [%4, #512] \n"
4890 "vldm %4!, {d16-d23} \n"
4891
4892 "vmla.f32 q13, q8, d4[0] \n"
4893 "vmla.f32 q14, q9, d4[1] \n"
4894 "vmla.f32 q15, q10, d5[0] \n"
4895 "vmla.f32 q12, q11, d5[1] \n"
4896
4897 "pld [%2, #384] \n"
4898 "vldm %2, {d0-d5} \n" // r10 r11 r12
4899
4900 "pld [%4, #512] \n"
4901 "vldm %4!, {d16-d23} \n"
4902
4903 "vmla.f32 q13, q8, d0[0] \n"
4904 "vmla.f32 q14, q9, d0[1] \n"
4905 "vmla.f32 q15, q10, d1[0] \n"
4906 "vmla.f32 q12, q11, d1[1] \n"
4907
4908 "pld [%4, #512] \n"
4909 "vldm %4!, {d16-d23} \n"
4910
4911 "vmla.f32 q13, q8, d2[0] \n"
4912 "vmla.f32 q14, q9, d2[1] \n"
4913 "vmla.f32 q15, q10, d3[0] \n"
4914 "vmla.f32 q12, q11, d3[1] \n"
4915
4916 "pld [%4, #512] \n"
4917 "vldm %4!, {d16-d23} \n"
4918
4919 "vmla.f32 q13, q8, d4[0] \n"
4920 "vmla.f32 q14, q9, d4[1] \n"
4921 "vmla.f32 q15, q10, d5[0] \n"
4922 "vmla.f32 q12, q11, d5[1] \n"
4923
4924 "pld [%3, #384] \n"
4925 "vldm %3, {d0-d5} \n" // r20 r21 r22
4926
4927 "pld [%4, #512] \n"
4928 "vldm %4!, {d16-d23} \n"
4929
4930 "vmla.f32 q13, q8, d0[0] \n"
4931 "vmla.f32 q14, q9, d0[1] \n"
4932 "vmla.f32 q15, q10, d1[0] \n"
4933 "vmla.f32 q12, q11, d1[1] \n"
4934
4935 "pld [%4, #512] \n"
4936 "vldm %4!, {d16-d23} \n"
4937
4938 "vmla.f32 q13, q8, d2[0] \n"
4939 "vmla.f32 q14, q9, d2[1] \n"
4940 "vmla.f32 q15, q10, d3[0] \n"
4941 "vmla.f32 q12, q11, d3[1] \n"
4942
4943 // "pld [%4, #512] \n"
4944 "vldm %4, {d16-d23} \n"
4945
4946 "vmla.f32 q13, q8, d4[0] \n"
4947 "vmla.f32 q14, q9, d4[1] \n"
4948 "vmla.f32 q15, q10, d5[0] \n"
4949 "vmla.f32 q12, q11, d5[1] \n"
4950
4951 "vadd.f32 q14, q14, q13 \n"
4952
4953 "add %1, %1, #32 \n"
4954
4955 "vadd.f32 q15, q15, q14 \n"
4956
4957 "add %2, %2, #32 \n"
4958
4959 "vadd.f32 q12, q12, q15 \n"
4960
4961 "add %3, %3, #32 \n"
4962
4963 "sub %4, %4, #512 \n" // kptr -= 8 * 16;
4964
4965 "vst1.f32 {d24-d25}, [%0 :128]! \n"
4966
4967 : "=r"(outptr0), // %0
4968 "=r"(r0), // %1
4969 "=r"(r1), // %2
4970 "=r"(r2), // %3
4971 "=r"(kptr) // %4
4972 : "0"(outptr0),
4973 "1"(r0),
4974 "2"(r1),
4975 "3"(r2),
4976 "4"(kptr)
4977 : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
4978 #endif // __aarch64__
4979 }
4980
4981 r0 += tailstep;
4982 r1 += tailstep;
4983 r2 += tailstep;
4984 }
4985 }
4986 }
4987 }
4988
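// 3x3 stride-2 convolution implemented as im2col followed by the packed sgemm kernel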
4989 static void conv3x3s2_im2col_sgemm_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
4990 {
4991 int w = bottom_blob.w;
4992 int inch = bottom_blob.c;
4993
4994 int outw = top_blob.w;
4995 int outh = top_blob.h;
4996 const int size = outw * outh;
4997
4998 // im2col
4999 Mat bottom_im2col(size, 9, inch, 16u, 4, opt.workspace_allocator);
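     // layout: one channel per input channel, 9 rows (one per 3x3 kernel tap), each row holds size 4-packed elements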
5000 {
5001         const int gap = (w * 2 - outw * 2) * 4; // skip the unread tail of this row plus the whole next row (stride 2 in height, pack4)
5002
5003 #pragma omp parallel for num_threads(opt.num_threads)
5004 for (int p = 0; p < inch; p++)
5005 {
5006 const Mat img = bottom_blob.channel(p);
5007 Mat out = bottom_im2col.channel(p);
5008
5009 float* ptr0 = out.row(0);
5010 float* ptr1 = out.row(1);
5011 float* ptr2 = out.row(2);
5012 float* ptr3 = out.row(3);
5013 float* ptr4 = out.row(4);
5014 float* ptr5 = out.row(5);
5015 float* ptr6 = out.row(6);
5016 float* ptr7 = out.row(7);
5017 float* ptr8 = out.row(8);
5018
5019 const float* r0 = img.row(0);
5020 const float* r1 = img.row(1);
5021 const float* r2 = img.row(2);
5022
5023 for (int i = 0; i < outh; i++)
5024 {
5025 int j = 0;
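                 // copy two stride-2 sampling positions per iteration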
5026 for (; j + 1 < outw; j += 2)
5027 {
5028 float32x4_t _r00 = vld1q_f32(r0);
5029 float32x4_t _r01 = vld1q_f32(r0 + 4);
5030 float32x4_t _r02 = vld1q_f32(r0 + 8);
5031 float32x4_t _r03 = vld1q_f32(r0 + 12);
5032 float32x4_t _r04 = vld1q_f32(r0 + 16);
5033
5034 float32x4_t _r10 = vld1q_f32(r1);
5035 float32x4_t _r11 = vld1q_f32(r1 + 4);
5036 float32x4_t _r12 = vld1q_f32(r1 + 8);
5037 float32x4_t _r13 = vld1q_f32(r1 + 12);
5038 float32x4_t _r14 = vld1q_f32(r1 + 16);
5039
5040 float32x4_t _r20 = vld1q_f32(r2);
5041 float32x4_t _r21 = vld1q_f32(r2 + 4);
5042 float32x4_t _r22 = vld1q_f32(r2 + 8);
5043 float32x4_t _r23 = vld1q_f32(r2 + 12);
5044 float32x4_t _r24 = vld1q_f32(r2 + 16);
5045
5046 vst1q_f32(ptr0, _r00);
5047 vst1q_f32(ptr0 + 4, _r02);
5048 vst1q_f32(ptr1, _r01);
5049 vst1q_f32(ptr1 + 4, _r03);
5050 vst1q_f32(ptr2, _r02);
5051 vst1q_f32(ptr2 + 4, _r04);
5052
5053 vst1q_f32(ptr3, _r10);
5054 vst1q_f32(ptr3 + 4, _r12);
5055 vst1q_f32(ptr4, _r11);
5056 vst1q_f32(ptr4 + 4, _r13);
5057 vst1q_f32(ptr5, _r12);
5058 vst1q_f32(ptr5 + 4, _r14);
5059
5060 vst1q_f32(ptr6, _r20);
5061 vst1q_f32(ptr6 + 4, _r22);
5062 vst1q_f32(ptr7, _r21);
5063 vst1q_f32(ptr7 + 4, _r23);
5064 vst1q_f32(ptr8, _r22);
5065 vst1q_f32(ptr8 + 4, _r24);
5066
5067 r0 += 16;
5068 r1 += 16;
5069 r2 += 16;
5070
5071 ptr0 += 8;
5072 ptr1 += 8;
5073 ptr2 += 8;
5074 ptr3 += 8;
5075 ptr4 += 8;
5076 ptr5 += 8;
5077 ptr6 += 8;
5078 ptr7 += 8;
5079 ptr8 += 8;
5080 }
5081 for (; j < outw; j++)
5082 {
5083 float32x4_t _r00 = vld1q_f32(r0);
5084 float32x4_t _r01 = vld1q_f32(r0 + 4);
5085 float32x4_t _r02 = vld1q_f32(r0 + 8);
5086
5087 float32x4_t _r10 = vld1q_f32(r1);
5088 float32x4_t _r11 = vld1q_f32(r1 + 4);
5089 float32x4_t _r12 = vld1q_f32(r1 + 8);
5090
5091 float32x4_t _r20 = vld1q_f32(r2);
5092 float32x4_t _r21 = vld1q_f32(r2 + 4);
5093 float32x4_t _r22 = vld1q_f32(r2 + 8);
5094
5095 vst1q_f32(ptr0, _r00);
5096 vst1q_f32(ptr1, _r01);
5097 vst1q_f32(ptr2, _r02);
5098 vst1q_f32(ptr3, _r10);
5099 vst1q_f32(ptr4, _r11);
5100 vst1q_f32(ptr5, _r12);
5101 vst1q_f32(ptr6, _r20);
5102 vst1q_f32(ptr7, _r21);
5103 vst1q_f32(ptr8, _r22);
5104
5105 r0 += 8;
5106 r1 += 8;
5107 r2 += 8;
5108
5109 ptr0 += 4;
5110 ptr1 += 4;
5111 ptr2 += 4;
5112 ptr3 += 4;
5113 ptr4 += 4;
5114 ptr5 += 4;
5115 ptr6 += 4;
5116 ptr7 += 4;
5117 ptr8 += 4;
5118 }
5119
5120 r0 += gap;
5121 r1 += gap;
5122 r2 += gap;
5123 }
5124 }
5125 }
5126
5127 im2col_sgemm_pack4_neon(bottom_im2col, top_blob, kernel, _bias, opt);
5128 }
5129