1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv3x3s1_winograd64_pack4to1_bf16s_neon(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel_tm,const Mat & _bias,const Option & opt)15 static void conv3x3s1_winograd64_pack4to1_bf16s_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
16 {
17 int w = bottom_blob.w;
18 int h = bottom_blob.h;
19 int inch = bottom_blob.c;
20 //size_t elemsize = bottom_blob.elemsize;
21 int elempack = bottom_blob.elempack;
22
23 int outw = top_blob.w;
24 int outh = top_blob.h;
25 int outch = top_blob.c;
26
27 // pad to 6n+2
28 Mat bottom_blob_bordered = bottom_blob;
29
30 outw = (outw + 5) / 6 * 6;
31 outh = (outh + 5) / 6 * 6;
32
33 w = outw + 2;
34 h = outh + 2;
35 copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt);
36
37 const float* bias = _bias;
38
39 // BEGIN transform input
40 Mat bottom_blob_tm;
41 {
42 int w_tm = outw / 6 * 8;
43 int h_tm = outh / 6 * 8;
44
45 const int tiles = w_tm / 8 * h_tm / 8;
46
47 // bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
48 bottom_blob_tm.create(tiles, 64, inch, 4u * elempack, elempack, opt.workspace_allocator);
49
50 // const float itm[8][8] = {
51 // {1.0f, 0.0f, -5.25f, 0.00f, 5.25f, 0.00f, -1.0f, 0.0f},
52 //
53 // {0.0f, 1.0f, 1.00f, -4.25f, -4.25f, 1.00f, 1.0f, 0.0f},
54 // {0.0f, -1.0f, 1.00f, 4.25f, -4.25f, -1.00f, 1.0f, 0.0f},
55 //
56 // {0.0f, 0.5f, 0.25f, -2.50f, -1.25f, 2.00f, 1.0f, 0.0f},
57 // {0.0f, -0.5f, 0.25f, 2.50f, -1.25f, -2.00f, 1.0f, 0.0f},
58 //
59 // {0.0f, 2.0f, 4.00f, -2.50f, -5.00f, 0.50f, 1.0f, 0.0f},
60 // {0.0f, -2.0f, 4.00f, 2.50f, -5.00f, -0.50f, 1.0f, 0.0f},
61 //
62 // {0.0f, -1.0f, 0.00f, 5.25f, 0.00f, -5.25f, 0.0f, 1.0f}
63 // };
64
65 // 0 = r00 - r06 + (r04 - r02) * 5.25
66 // 7 = r07 - r01 + (r03 - r05) * 5.25
67
68 // 1 = (r02 + r06 - r04 * 4.25) + (r01 - r03 * 4.25 + r05)
69 // 2 = (r02 + r06 - r04 * 4.25) - (r01 - r03 * 4.25 + r05)
70
71 // 3 = (r06 + r02 * 0.25 - r04 * 1.25) + (r01 * 0.5 - r03 * 2.5 + r05 * 2)
72 // 4 = (r06 + r02 * 0.25 - r04 * 1.25) - (r01 * 0.5 - r03 * 2.5 + r05 * 2)
73
74 // reuse r04 * 1.25
75 // reuse r03 * 2.5
76 // 5 = (r06 + (r02 - r04 * 1.25) * 4) + (r01 * 2 - r03 * 2.5 + r05 * 0.5)
77 // 6 = (r06 + (r02 - r04 * 1.25) * 4) - (r01 * 2 - r03 * 2.5 + r05 * 0.5)
78
79 #pragma omp parallel for num_threads(opt.num_threads)
80 for (int q = 0; q < inch; q++)
81 {
82 const Mat img0 = bottom_blob_bordered.channel(q);
83 Mat img0_tm = bottom_blob_tm.channel(q);
84
85 float tmp[8][8][4];
86
87 // tile
88 for (int i = 0; i < h_tm / 8; i++)
89 {
90 for (int j = 0; j < w_tm / 8; j++)
91 {
92 const unsigned short* r0 = img0.row<const unsigned short>(i * 6) + (j * 6) * 4;
93
94 for (int m = 0; m < 8; m++)
95 {
96 float32x4_t _r00 = vcvt_f32_bf16(vld1_u16(r0));
97 float32x4_t _r01 = vcvt_f32_bf16(vld1_u16(r0 + 4));
98 float32x4_t _r02 = vcvt_f32_bf16(vld1_u16(r0 + 8));
99 float32x4_t _r03 = vcvt_f32_bf16(vld1_u16(r0 + 12));
100 float32x4_t _r04 = vcvt_f32_bf16(vld1_u16(r0 + 16));
101 float32x4_t _r05 = vcvt_f32_bf16(vld1_u16(r0 + 20));
102 float32x4_t _r06 = vcvt_f32_bf16(vld1_u16(r0 + 24));
103 float32x4_t _r07 = vcvt_f32_bf16(vld1_u16(r0 + 28));
104
105 float32x4_t _tmp0m = vmlaq_n_f32(vsubq_f32(_r00, _r06), vsubq_f32(_r04, _r02), 5.25f);
106 float32x4_t _tmp7m = vmlaq_n_f32(vsubq_f32(_r07, _r01), vsubq_f32(_r03, _r05), 5.25f);
107 vst1q_f32(tmp[0][m], _tmp0m);
108 vst1q_f32(tmp[7][m], _tmp7m);
109
110 // tmp[0][m] = r0[0] - r0[6] + (r0[4] - r0[2]) * 5.25;
111 // tmp[7][m] = r0[7] - r0[1] + (r0[3] - r0[5]) * 5.25;
112
113 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_r02, _r06), _r04, 4.25f);
114 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_r01, _r05), _r03, 4.25f);
115
116 // float tmp12a = (r0[2] + r0[6] - r0[4] * 4.25);
117 // float tmp12b = (r0[1] + r0[5] - r0[3] * 4.25);
118
119 float32x4_t _tmp1m = vaddq_f32(_tmp12a, _tmp12b);
120 float32x4_t _tmp2m = vsubq_f32(_tmp12a, _tmp12b);
121 vst1q_f32(tmp[1][m], _tmp1m);
122 vst1q_f32(tmp[2][m], _tmp2m);
123
124 // tmp[1][m] = tmp12a + tmp12b;
125 // tmp[2][m] = tmp12a - tmp12b;
126
127 float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_r06, _r02, 0.25f), _r04, 1.25f);
128 float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 0.5f), _r03, 2.5f), _r05, 2.f);
129
130 // float tmp34a = (r0[6] + r0[2] * 0.25 - r0[4] * 1.25);
131 // float tmp34b = (r0[1] * 0.5 - r0[3] * 2.5 + r0[5] * 2);
132
133 float32x4_t _tmp3m = vaddq_f32(_tmp34a, _tmp34b);
134 float32x4_t _tmp4m = vsubq_f32(_tmp34a, _tmp34b);
135 vst1q_f32(tmp[3][m], _tmp3m);
136 vst1q_f32(tmp[4][m], _tmp4m);
137
138 // tmp[3][m] = tmp34a + tmp34b;
139 // tmp[4][m] = tmp34a - tmp34b;
140
141 float32x4_t _tmp56a = vmlaq_n_f32(_r06, vmlsq_n_f32(_r02, _r04, 1.25f), 4.f);
142 float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_r01, 2.f), _r03, 2.5f), _r05, 0.5f);
143
144 // float tmp56a = (r0[6] + (r0[2] - r0[4] * 1.25) * 4);
145 // float tmp56b = (r0[1] * 2 - r0[3] * 2.5 + r0[5] * 0.5);
146
147 float32x4_t _tmp5m = vaddq_f32(_tmp56a, _tmp56b);
148 float32x4_t _tmp6m = vsubq_f32(_tmp56a, _tmp56b);
149 vst1q_f32(tmp[5][m], _tmp5m);
150 vst1q_f32(tmp[6][m], _tmp6m);
151
152 // tmp[5][m] = tmp56a + tmp56b;
153 // tmp[6][m] = tmp56a - tmp56b;
154
155 r0 += w * 4;
156 }
157
158 float* r0_tm_0 = (float*)img0_tm + (i * w_tm / 8 + j) * 4;
159 float* r0_tm_1 = r0_tm_0 + tiles * 4;
160 float* r0_tm_2 = r0_tm_0 + tiles * 8;
161 float* r0_tm_3 = r0_tm_0 + tiles * 12;
162 float* r0_tm_4 = r0_tm_0 + tiles * 16;
163 float* r0_tm_5 = r0_tm_0 + tiles * 20;
164 float* r0_tm_6 = r0_tm_0 + tiles * 24;
165 float* r0_tm_7 = r0_tm_0 + tiles * 28;
166
167 for (int m = 0; m < 8; m++)
168 {
169 float32x4_t _tmp00 = vld1q_f32(tmp[m][0]);
170 float32x4_t _tmp01 = vld1q_f32(tmp[m][1]);
171 float32x4_t _tmp02 = vld1q_f32(tmp[m][2]);
172 float32x4_t _tmp03 = vld1q_f32(tmp[m][3]);
173 float32x4_t _tmp04 = vld1q_f32(tmp[m][4]);
174 float32x4_t _tmp05 = vld1q_f32(tmp[m][5]);
175 float32x4_t _tmp06 = vld1q_f32(tmp[m][6]);
176 float32x4_t _tmp07 = vld1q_f32(tmp[m][7]);
177
178 float32x4_t _r0tm0 = vmlaq_n_f32(vsubq_f32(_tmp00, _tmp06), vsubq_f32(_tmp04, _tmp02), 5.25f);
179 float32x4_t _r0tm7 = vmlaq_n_f32(vsubq_f32(_tmp07, _tmp01), vsubq_f32(_tmp03, _tmp05), 5.25f);
180
181 // r0_tm[0] = tmp0[0] - tmp0[6] + (tmp0[4] - tmp0[2]) * 5.25;
182 // r0_tm[7] = tmp0[7] - tmp0[1] + (tmp0[3] - tmp0[5]) * 5.25;
183
184 float32x4_t _tmp12a = vmlsq_n_f32(vaddq_f32(_tmp02, _tmp06), _tmp04, 4.25f);
185 float32x4_t _tmp12b = vmlsq_n_f32(vaddq_f32(_tmp01, _tmp05), _tmp03, 4.25f);
186
187 // float tmp12a = (tmp0[2] + tmp0[6] - tmp0[4] * 4.25);
188 // float tmp12b = (tmp0[1] + tmp0[5] - tmp0[3] * 4.25);
189
190 float32x4_t _r0tm1 = vaddq_f32(_tmp12a, _tmp12b);
191 float32x4_t _r0tm2 = vsubq_f32(_tmp12a, _tmp12b);
192
193 // r0_tm[1] = tmp12a + tmp12b;
194 // r0_tm[2] = tmp12a - tmp12b;
195
196 float32x4_t _tmp34a = vmlsq_n_f32(vmlaq_n_f32(_tmp06, _tmp02, 0.25f), _tmp04, 1.25f);
197 float32x4_t _tmp34b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 0.5f), _tmp03, 2.5f), _tmp05, 2.f);
198
199 // float tmp34a = (tmp0[6] + tmp0[2] * 0.25 - tmp0[4] * 1.25);
200 // float tmp34b = (tmp0[1] * 0.5 - tmp0[3] * 2.5 + tmp0[5] * 2);
201
202 float32x4_t _r0tm3 = vaddq_f32(_tmp34a, _tmp34b);
203 float32x4_t _r0tm4 = vsubq_f32(_tmp34a, _tmp34b);
204
205 // r0_tm[3] = tmp34a + tmp34b;
206 // r0_tm[4] = tmp34a - tmp34b;
207
208 float32x4_t _tmp56a = vmlaq_n_f32(_tmp06, vmlsq_n_f32(_tmp02, _tmp04, 1.25f), 4.f);
209 float32x4_t _tmp56b = vmlaq_n_f32(vmlsq_n_f32(vmulq_n_f32(_tmp01, 2.f), _tmp03, 2.5f), _tmp05, 0.5f);
210
211 // float tmp56a = (tmp0[6] + (tmp0[2] - tmp0[4] * 1.25) * 4);
212 // float tmp56b = (tmp0[1] * 2 - tmp0[3] * 2.5 + tmp0[5] * 0.5);
213
214 float32x4_t _r0tm5 = vaddq_f32(_tmp56a, _tmp56b);
215 float32x4_t _r0tm6 = vsubq_f32(_tmp56a, _tmp56b);
216
217 // r0_tm[5] = tmp56a + tmp56b;
218 // r0_tm[6] = tmp56a - tmp56b;
219
220 vst1q_f32(r0_tm_0, _r0tm0);
221 vst1q_f32(r0_tm_1, _r0tm1);
222 vst1q_f32(r0_tm_2, _r0tm2);
223 vst1q_f32(r0_tm_3, _r0tm3);
224 vst1q_f32(r0_tm_4, _r0tm4);
225 vst1q_f32(r0_tm_5, _r0tm5);
226 vst1q_f32(r0_tm_6, _r0tm6);
227 vst1q_f32(r0_tm_7, _r0tm7);
228
229 r0_tm_0 += tiles * 32;
230 r0_tm_1 += tiles * 32;
231 r0_tm_2 += tiles * 32;
232 r0_tm_3 += tiles * 32;
233 r0_tm_4 += tiles * 32;
234 r0_tm_5 += tiles * 32;
235 r0_tm_6 += tiles * 32;
236 r0_tm_7 += tiles * 32;
237 }
238 }
239 }
240 }
241 }
242 bottom_blob_bordered = Mat();
243 // END transform input
244
245 // BEGIN dot
246 Mat top_blob_tm;
247 {
248 int w_tm = outw / 6 * 8;
249 int h_tm = outh / 6 * 8;
250
251 const int tiles = h_tm / 8 * w_tm / 8;
252
253 // permute
254 // bottom_blob_tm.create(tiles, 64, inch, elemsize, elempack, opt.workspace_allocator);
255 Mat bottom_blob_tm2;
256 #if __aarch64__
257 if (tiles >= 12)
258 bottom_blob_tm2.create(12 * inch, tiles / 12 + (tiles % 12) / 8 + (tiles % 12 % 8) / 4 + tiles % 12 % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
259 else if (tiles >= 8)
260 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
261 else if (tiles >= 4)
262 bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
263 else // if (tiles >= 1)
264 bottom_blob_tm2.create(1 * inch, tiles, 64, 4u * elempack, elempack, opt.workspace_allocator);
265 #else
266 if (tiles >= 8)
267 bottom_blob_tm2.create(8 * inch, tiles / 8 + (tiles % 8) / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
268 else if (tiles >= 4)
269 bottom_blob_tm2.create(4 * inch, tiles / 4 + tiles % 4, 64, 4u * elempack, elempack, opt.workspace_allocator);
270 else // if (tiles >= 1)
271 bottom_blob_tm2.create(1 * inch, tiles, 64, 4u * elempack, elempack, opt.workspace_allocator);
272 #endif
273
274 #pragma omp parallel for num_threads(opt.num_threads)
275 for (int r = 0; r < 64; r++)
276 {
277 Mat tm2 = bottom_blob_tm2.channel(r);
278
279 // tile
280 int i = 0;
281 #if __aarch64__
282 for (; i + 11 < tiles; i += 12)
283 {
284 float* tm2p = tm2.row(i / 12);
285
286 const float* r0 = bottom_blob_tm;
287
288 r0 += (r * tiles + i) * 4;
289
290 for (int q = 0; q < inch; q++)
291 {
292 asm volatile(
293 "prfm pldl1keep, [%0, #512] \n"
294 "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
295 "prfm pldl1keep, [%0, #512] \n"
296 "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0], #64 \n"
297 "prfm pldl1keep, [%0, #512] \n"
298 "ld4 {v16.4s, v17.4s, v18.4s, v19.4s}, [%0] \n"
299 "sub %0, %0, #128 \n"
300 "st1 {v0.4s}, [%1], #16 \n"
301 "st1 {v4.4s}, [%1], #16 \n"
302 "st1 {v16.4s}, [%1], #16 \n"
303 "st1 {v1.4s}, [%1], #16 \n"
304 "st1 {v5.4s}, [%1], #16 \n"
305 "st1 {v17.4s}, [%1], #16 \n"
306 "st1 {v2.4s}, [%1], #16 \n"
307 "st1 {v6.4s}, [%1], #16 \n"
308 "st1 {v18.4s}, [%1], #16 \n"
309 "st1 {v3.4s}, [%1], #16 \n"
310 "st1 {v7.4s}, [%1], #16 \n"
311 "st1 {v19.4s}, [%1], #16 \n"
312 : "=r"(r0), // %0
313 "=r"(tm2p) // %1
314 : "0"(r0),
315 "1"(tm2p)
316 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
317 r0 += bottom_blob_tm.cstep * 4;
318 }
319 }
320 #endif
321 for (; i + 7 < tiles; i += 8)
322 {
323 #if __aarch64__
324 float* tm2p = tm2.row(i / 12 + (i % 12) / 8);
325 #else
326 float* tm2p = tm2.row(i / 8);
327 #endif
328
329 const float* r0 = bottom_blob_tm;
330
331 r0 += (r * tiles + i) * 4;
332
333 for (int q = 0; q < inch; q++)
334 {
335 #if __aarch64__
336 asm volatile(
337 "prfm pldl1keep, [%0, #512] \n"
338 "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
339 "prfm pldl1keep, [%0, #512] \n"
340 "ld4 {v4.4s, v5.4s, v6.4s, v7.4s}, [%0] \n"
341 "sub %0, %0, #64 \n"
342 "st1 {v0.4s}, [%1], #16 \n"
343 "st1 {v4.4s}, [%1], #16 \n"
344 "st1 {v1.4s}, [%1], #16 \n"
345 "st1 {v5.4s}, [%1], #16 \n"
346 "st1 {v2.4s}, [%1], #16 \n"
347 "st1 {v6.4s}, [%1], #16 \n"
348 "st1 {v3.4s}, [%1], #16 \n"
349 "st1 {v7.4s}, [%1], #16 \n"
350 : "=r"(r0), // %0
351 "=r"(tm2p) // %1
352 : "0"(r0),
353 "1"(tm2p)
354 : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
355 #else
356 asm volatile(
357 "pld [%0, #256] \n"
358 "vld4.f32 {d0-d3}, [%0 :128]! \n"
359 "pld [%0, #256] \n"
360 "vld4.f32 {d4-d7}, [%0 :128]! \n"
361 "pld [%0, #256] \n"
362 "vld4.f32 {d16-d19}, [%0 :128]! \n"
363 "pld [%0, #256] \n"
364 "vld4.f32 {d20-d23}, [%0 :128] \n"
365 "sub %0, %0, #96 \n"
366 "vswp d1, d4 \n"
367 "vswp d3, d6 \n"
368 "vswp d17, d20 \n"
369 "vswp d19, d22 \n"
370 "vst1.f32 {d0-d1}, [%1 :128]! \n"
371 "vst1.f32 {d16-d17}, [%1 :128]! \n"
372 "vst1.f32 {d4-d5}, [%1 :128]! \n"
373 "vst1.f32 {d20-d21}, [%1 :128]! \n"
374 "vst1.f32 {d2-d3}, [%1 :128]! \n"
375 "vst1.f32 {d18-d19}, [%1 :128]! \n"
376 "vst1.f32 {d6-d7}, [%1 :128]! \n"
377 "vst1.f32 {d22-d23}, [%1 :128]! \n"
378 : "=r"(r0), // %0
379 "=r"(tm2p) // %1
380 : "0"(r0),
381 "1"(tm2p)
382 : "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
383 #endif
384 r0 += bottom_blob_tm.cstep * 4;
385 }
386 }
387 for (; i + 3 < tiles; i += 4)
388 {
389 #if __aarch64__
390 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
391 #else
392 float* tm2p = tm2.row(i / 8 + (i % 8) / 4);
393 #endif
394
395 const float* r0 = bottom_blob_tm;
396
397 r0 += (r * tiles + i) * 4;
398
399 for (int q = 0; q < inch; q++)
400 {
401 #if __aarch64__
402 asm volatile(
403 "prfm pldl1keep, [%0, #512] \n"
404 "ld4 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0] \n"
405 "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"
406 : "=r"(r0), // %0
407 "=r"(tm2p) // %1
408 : "0"(r0),
409 "1"(tm2p)
410 : "memory", "v0", "v1", "v2", "v3");
411 #else
412 asm volatile(
413 "pld [%0, #256] \n"
414 "vld4.f32 {d0-d3}, [%0 :128]! \n"
415 "pld [%0, #256] \n"
416 "vld4.f32 {d4-d7}, [%0 :128] \n"
417 "sub %0, %0, #32 \n"
418 "vswp d1, d4 \n"
419 "vswp d3, d6 \n"
420 "vst1.f32 {d0-d1}, [%1 :128]! \n"
421 "vst1.f32 {d4-d5}, [%1 :128]! \n"
422 "vst1.f32 {d2-d3}, [%1 :128]! \n"
423 "vst1.f32 {d6-d7}, [%1 :128]! \n"
424 : "=r"(r0), // %0
425 "=r"(tm2p) // %1
426 : "0"(r0),
427 "1"(tm2p)
428 : "memory", "q0", "q1", "q2", "q3");
429 #endif // __aarch64__
430 r0 += bottom_blob_tm.cstep * 4;
431 }
432 }
433 for (; i < tiles; i++)
434 {
435 #if __aarch64__
436 float* tm2p = tm2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
437 #else
438 float* tm2p = tm2.row(i / 8 + (i % 8) / 4 + i % 4);
439 #endif
440
441 const float* r0 = bottom_blob_tm;
442
443 r0 += (r * tiles + i) * 4;
444
445 for (int q = 0; q < inch; q++)
446 {
447 #if __aarch64__
448 asm volatile(
449 "prfm pldl1keep, [%0, #128] \n"
450 "ld1 {v0.4s}, [%0] \n"
451 "st1 {v0.4s}, [%1], #16 \n"
452 : "=r"(r0), // %0
453 "=r"(tm2p) // %1
454 : "0"(r0),
455 "1"(tm2p)
456 : "memory", "v0");
457 #else
458 asm volatile(
459 "pld [%0, #128] \n"
460 "vld1.f32 {d0-d1}, [%0 :128] \n"
461 "vst1.f32 {d0-d1}, [%1 :128]! \n"
462 : "=r"(r0), // %0
463 "=r"(tm2p) // %1
464 : "0"(r0),
465 "1"(tm2p)
466 : "memory", "q0");
467 #endif // __aarch64__
468 r0 += bottom_blob_tm.cstep * 4;
469 }
470 }
471 }
472
473 bottom_blob_tm = Mat();
474 // permute end
475
476 top_blob_tm.create(tiles, 64, outch, 4u, 1, opt.workspace_allocator);
477
478 int nn_outch = 0;
479 int remain_outch_start = 0;
480
481 #if __aarch64__
482 nn_outch = outch >> 3;
483
484 #pragma omp parallel for num_threads(opt.num_threads)
485 for (int pp = 0; pp < nn_outch; pp++)
486 {
487 int p = pp * 8;
488
489 float* output0_tm = top_blob_tm.channel(p);
490 float* output1_tm = top_blob_tm.channel(p + 1);
491 float* output2_tm = top_blob_tm.channel(p + 2);
492 float* output3_tm = top_blob_tm.channel(p + 3);
493 float* output4_tm = top_blob_tm.channel(p + 4);
494 float* output5_tm = top_blob_tm.channel(p + 5);
495 float* output6_tm = top_blob_tm.channel(p + 6);
496 float* output7_tm = top_blob_tm.channel(p + 7);
497
498 const Mat kernel01_tm = kernel_tm.channel(p / 8);
499
500 for (int r = 0; r < 64; r++)
501 {
502 const Mat bb2 = bottom_blob_tm2.channel(r);
503
504 int i = 0;
505 for (; i + 11 < tiles; i += 12)
506 {
507 const float* r0 = bb2.row(i / 12);
508
509 const float* kptr = kernel01_tm.row(r);
510
511 int nn = inch; // inch always > 0
512
513 asm volatile(
514 "eor v8.16b, v8.16b, v8.16b \n"
515 "eor v9.16b, v9.16b, v9.16b \n"
516 "eor v10.16b, v10.16b, v10.16b \n"
517 "eor v11.16b, v11.16b, v11.16b \n"
518 "eor v12.16b, v12.16b, v12.16b \n"
519 "eor v13.16b, v13.16b, v13.16b \n"
520 "eor v14.16b, v14.16b, v14.16b \n"
521 "eor v15.16b, v15.16b, v15.16b \n"
522 "eor v16.16b, v16.16b, v16.16b \n"
523 "eor v17.16b, v17.16b, v17.16b \n"
524 "eor v18.16b, v18.16b, v18.16b \n"
525 "eor v19.16b, v19.16b, v19.16b \n"
526 "eor v20.16b, v20.16b, v20.16b \n"
527 "eor v21.16b, v21.16b, v21.16b \n"
528 "eor v22.16b, v22.16b, v22.16b \n"
529 "eor v23.16b, v23.16b, v23.16b \n"
530 "eor v24.16b, v24.16b, v24.16b \n"
531 "eor v25.16b, v25.16b, v25.16b \n"
532 "eor v26.16b, v26.16b, v26.16b \n"
533 "eor v27.16b, v27.16b, v27.16b \n"
534 "eor v28.16b, v28.16b, v28.16b \n"
535 "eor v29.16b, v29.16b, v29.16b \n"
536 "eor v30.16b, v30.16b, v30.16b \n"
537 "eor v31.16b, v31.16b, v31.16b \n"
538
539 "0: \n"
540
541 "prfm pldl1keep, [%9, #512] \n"
542 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
543
544 "prfm pldl1keep, [%10, #512] \n"
545 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
546
547 "subs %w0, %w0, #1 \n"
548
549 "fmla v8.4s, v0.4s, v4.s[0] \n"
550 "fmla v11.4s, v0.4s, v4.s[1] \n"
551 "fmla v14.4s, v0.4s, v4.s[2] \n"
552 "fmla v17.4s, v0.4s, v4.s[3] \n"
553 "fmla v20.4s, v0.4s, v5.s[0] \n"
554 "fmla v23.4s, v0.4s, v5.s[1] \n"
555 "fmla v26.4s, v0.4s, v5.s[2] \n"
556 "fmla v29.4s, v0.4s, v5.s[3] \n"
557
558 "fmla v9.4s, v1.4s, v4.s[0] \n"
559 "fmla v12.4s, v1.4s, v4.s[1] \n"
560 "fmla v15.4s, v1.4s, v4.s[2] \n"
561 "fmla v18.4s, v1.4s, v4.s[3] \n"
562 "fmla v21.4s, v1.4s, v5.s[0] \n"
563 "fmla v24.4s, v1.4s, v5.s[1] \n"
564 "fmla v27.4s, v1.4s, v5.s[2] \n"
565 "fmla v30.4s, v1.4s, v5.s[3] \n"
566
567 "fmla v10.4s, v2.4s, v4.s[0] \n"
568 "fmla v13.4s, v2.4s, v4.s[1] \n"
569 "fmla v16.4s, v2.4s, v4.s[2] \n"
570 "fmla v19.4s, v2.4s, v4.s[3] \n"
571 "fmla v22.4s, v2.4s, v5.s[0] \n"
572 "fmla v25.4s, v2.4s, v5.s[1] \n"
573 "fmla v28.4s, v2.4s, v5.s[2] \n"
574 "fmla v31.4s, v2.4s, v5.s[3] \n"
575
576 "fmla v8.4s, v3.4s, v6.s[0] \n"
577 "fmla v11.4s, v3.4s, v6.s[1] \n"
578 "fmla v14.4s, v3.4s, v6.s[2] \n"
579 "fmla v17.4s, v3.4s, v6.s[3] \n"
580 "fmla v20.4s, v3.4s, v7.s[0] \n"
581 "fmla v23.4s, v3.4s, v7.s[1] \n"
582 "fmla v26.4s, v3.4s, v7.s[2] \n"
583 "fmla v29.4s, v3.4s, v7.s[3] \n"
584
585 "prfm pldl1keep, [%9, #512] \n"
586 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
587
588 "fmla v9.4s, v0.4s, v6.s[0] \n"
589 "fmla v12.4s, v0.4s, v6.s[1] \n"
590 "fmla v15.4s, v0.4s, v6.s[2] \n"
591 "fmla v18.4s, v0.4s, v6.s[3] \n"
592 "fmla v21.4s, v0.4s, v7.s[0] \n"
593 "fmla v24.4s, v0.4s, v7.s[1] \n"
594 "fmla v27.4s, v0.4s, v7.s[2] \n"
595 "fmla v30.4s, v0.4s, v7.s[3] \n"
596
597 "fmla v10.4s, v1.4s, v6.s[0] \n"
598 "fmla v13.4s, v1.4s, v6.s[1] \n"
599 "fmla v16.4s, v1.4s, v6.s[2] \n"
600 "fmla v19.4s, v1.4s, v6.s[3] \n"
601 "fmla v22.4s, v1.4s, v7.s[0] \n"
602 "fmla v25.4s, v1.4s, v7.s[1] \n"
603 "fmla v28.4s, v1.4s, v7.s[2] \n"
604 "fmla v31.4s, v1.4s, v7.s[3] \n"
605
606 "prfm pldl1keep, [%10, #512] \n"
607 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
608
609 "fmla v8.4s, v2.4s, v4.s[0] \n"
610 "fmla v11.4s, v2.4s, v4.s[1] \n"
611 "fmla v14.4s, v2.4s, v4.s[2] \n"
612 "fmla v17.4s, v2.4s, v4.s[3] \n"
613 "fmla v20.4s, v2.4s, v5.s[0] \n"
614 "fmla v23.4s, v2.4s, v5.s[1] \n"
615 "fmla v26.4s, v2.4s, v5.s[2] \n"
616 "fmla v29.4s, v2.4s, v5.s[3] \n"
617
618 "fmla v9.4s, v3.4s, v4.s[0] \n"
619 "fmla v12.4s, v3.4s, v4.s[1] \n"
620 "fmla v15.4s, v3.4s, v4.s[2] \n"
621 "fmla v18.4s, v3.4s, v4.s[3] \n"
622 "fmla v21.4s, v3.4s, v5.s[0] \n"
623 "fmla v24.4s, v3.4s, v5.s[1] \n"
624 "fmla v27.4s, v3.4s, v5.s[2] \n"
625 "fmla v30.4s, v3.4s, v5.s[3] \n"
626
627 "prfm pldl1keep, [%9, #512] \n"
628 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
629
630 "fmla v10.4s, v0.4s, v4.s[0] \n"
631 "fmla v13.4s, v0.4s, v4.s[1] \n"
632 "fmla v16.4s, v0.4s, v4.s[2] \n"
633 "fmla v19.4s, v0.4s, v4.s[3] \n"
634 "fmla v22.4s, v0.4s, v5.s[0] \n"
635 "fmla v25.4s, v0.4s, v5.s[1] \n"
636 "fmla v28.4s, v0.4s, v5.s[2] \n"
637 "fmla v31.4s, v0.4s, v5.s[3] \n"
638
639 "fmla v8.4s, v1.4s, v6.s[0] \n"
640 "fmla v11.4s, v1.4s, v6.s[1] \n"
641 "fmla v14.4s, v1.4s, v6.s[2] \n"
642 "fmla v17.4s, v1.4s, v6.s[3] \n"
643 "fmla v20.4s, v1.4s, v7.s[0] \n"
644 "fmla v23.4s, v1.4s, v7.s[1] \n"
645 "fmla v26.4s, v1.4s, v7.s[2] \n"
646 "fmla v29.4s, v1.4s, v7.s[3] \n"
647
648 "fmla v9.4s, v2.4s, v6.s[0] \n"
649 "fmla v12.4s, v2.4s, v6.s[1] \n"
650 "fmla v15.4s, v2.4s, v6.s[2] \n"
651 "fmla v18.4s, v2.4s, v6.s[3] \n"
652 "fmla v21.4s, v2.4s, v7.s[0] \n"
653 "fmla v24.4s, v2.4s, v7.s[1] \n"
654 "fmla v27.4s, v2.4s, v7.s[2] \n"
655 "fmla v30.4s, v2.4s, v7.s[3] \n"
656
657 "fmla v10.4s, v3.4s, v6.s[0] \n"
658 "fmla v13.4s, v3.4s, v6.s[1] \n"
659 "fmla v16.4s, v3.4s, v6.s[2] \n"
660 "fmla v19.4s, v3.4s, v6.s[3] \n"
661 "fmla v22.4s, v3.4s, v7.s[0] \n"
662 "fmla v25.4s, v3.4s, v7.s[1] \n"
663 "fmla v28.4s, v3.4s, v7.s[2] \n"
664 "fmla v31.4s, v3.4s, v7.s[3] \n"
665
666 "bne 0b \n"
667
668 "st1 {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
669 "st1 {v11.4s, v12.4s, v13.4s}, [%2], #48 \n"
670 "st1 {v14.4s, v15.4s, v16.4s}, [%3], #48 \n"
671 "st1 {v17.4s, v18.4s, v19.4s}, [%4], #48 \n"
672 "st1 {v20.4s, v21.4s, v22.4s}, [%5], #48 \n"
673 "st1 {v23.4s, v24.4s, v25.4s}, [%6], #48 \n"
674 "st1 {v26.4s, v27.4s, v28.4s}, [%7], #48 \n"
675 "st1 {v29.4s, v30.4s, v31.4s}, [%8], #48 \n"
676
677 : "=r"(nn), // %0
678 "=r"(output0_tm), // %1
679 "=r"(output1_tm), // %2
680 "=r"(output2_tm), // %3
681 "=r"(output3_tm), // %4
682 "=r"(output4_tm), // %5
683 "=r"(output5_tm), // %6
684 "=r"(output6_tm), // %7
685 "=r"(output7_tm), // %8
686 "=r"(r0), // %9
687 "=r"(kptr) // %10
688 : "0"(nn),
689 "1"(output0_tm),
690 "2"(output1_tm),
691 "3"(output2_tm),
692 "4"(output3_tm),
693 "5"(output4_tm),
694 "6"(output5_tm),
695 "7"(output6_tm),
696 "8"(output7_tm),
697 "9"(r0),
698 "10"(kptr)
699 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
700 }
701 for (; i + 7 < tiles; i += 8)
702 {
703 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
704
705 const float* kptr = kernel01_tm.row(r);
706
707 int nn = inch; // inch always > 0
708
709 asm volatile(
710 "eor v16.16b, v16.16b, v16.16b \n"
711 "eor v17.16b, v17.16b, v17.16b \n"
712 "eor v18.16b, v18.16b, v18.16b \n"
713 "eor v19.16b, v19.16b, v19.16b \n"
714 "eor v20.16b, v20.16b, v20.16b \n"
715 "eor v21.16b, v21.16b, v21.16b \n"
716 "eor v22.16b, v22.16b, v22.16b \n"
717 "eor v23.16b, v23.16b, v23.16b \n"
718 "eor v24.16b, v24.16b, v24.16b \n"
719 "eor v25.16b, v25.16b, v25.16b \n"
720 "eor v26.16b, v26.16b, v26.16b \n"
721 "eor v27.16b, v27.16b, v27.16b \n"
722 "eor v28.16b, v28.16b, v28.16b \n"
723 "eor v29.16b, v29.16b, v29.16b \n"
724 "eor v30.16b, v30.16b, v30.16b \n"
725 "eor v31.16b, v31.16b, v31.16b \n"
726
727 "0: \n"
728
729 "prfm pldl1keep, [%9, #512] \n"
730 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
731
732 "prfm pldl1keep, [%10, #512] \n"
733 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
734
735 "subs %w0, %w0, #1 \n"
736
737 "fmla v16.4s, v0.4s, v4.s[0] \n"
738 "fmla v18.4s, v0.4s, v4.s[1] \n"
739 "fmla v20.4s, v0.4s, v4.s[2] \n"
740 "fmla v22.4s, v0.4s, v4.s[3] \n"
741 "fmla v24.4s, v0.4s, v5.s[0] \n"
742 "fmla v26.4s, v0.4s, v5.s[1] \n"
743 "fmla v28.4s, v0.4s, v5.s[2] \n"
744 "fmla v30.4s, v0.4s, v5.s[3] \n"
745 "fmla v17.4s, v1.4s, v4.s[0] \n"
746 "fmla v19.4s, v1.4s, v4.s[1] \n"
747 "fmla v21.4s, v1.4s, v4.s[2] \n"
748 "fmla v23.4s, v1.4s, v4.s[3] \n"
749 "fmla v25.4s, v1.4s, v5.s[0] \n"
750 "fmla v27.4s, v1.4s, v5.s[1] \n"
751 "fmla v29.4s, v1.4s, v5.s[2] \n"
752 "fmla v31.4s, v1.4s, v5.s[3] \n"
753
754 "fmla v16.4s, v2.4s, v6.s[0] \n"
755 "fmla v18.4s, v2.4s, v6.s[1] \n"
756 "fmla v20.4s, v2.4s, v6.s[2] \n"
757 "fmla v22.4s, v2.4s, v6.s[3] \n"
758 "fmla v24.4s, v2.4s, v7.s[0] \n"
759 "fmla v26.4s, v2.4s, v7.s[1] \n"
760 "fmla v28.4s, v2.4s, v7.s[2] \n"
761 "fmla v30.4s, v2.4s, v7.s[3] \n"
762 "fmla v17.4s, v3.4s, v6.s[0] \n"
763 "fmla v19.4s, v3.4s, v6.s[1] \n"
764 "fmla v21.4s, v3.4s, v6.s[2] \n"
765 "fmla v23.4s, v3.4s, v6.s[3] \n"
766 "fmla v25.4s, v3.4s, v7.s[0] \n"
767 "fmla v27.4s, v3.4s, v7.s[1] \n"
768 "fmla v29.4s, v3.4s, v7.s[2] \n"
769 "fmla v31.4s, v3.4s, v7.s[3] \n"
770
771 "prfm pldl1keep, [%9, #512] \n"
772 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%9], #64 \n"
773
774 "prfm pldl1keep, [%10, #512] \n"
775 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
776
777 "fmla v16.4s, v12.4s, v8.s[0] \n"
778 "fmla v18.4s, v12.4s, v8.s[1] \n"
779 "fmla v20.4s, v12.4s, v8.s[2] \n"
780 "fmla v22.4s, v12.4s, v8.s[3] \n"
781 "fmla v24.4s, v12.4s, v9.s[0] \n"
782 "fmla v26.4s, v12.4s, v9.s[1] \n"
783 "fmla v28.4s, v12.4s, v9.s[2] \n"
784 "fmla v30.4s, v12.4s, v9.s[3] \n"
785 "fmla v17.4s, v13.4s, v8.s[0] \n"
786 "fmla v19.4s, v13.4s, v8.s[1] \n"
787 "fmla v21.4s, v13.4s, v8.s[2] \n"
788 "fmla v23.4s, v13.4s, v8.s[3] \n"
789 "fmla v25.4s, v13.4s, v9.s[0] \n"
790 "fmla v27.4s, v13.4s, v9.s[1] \n"
791 "fmla v29.4s, v13.4s, v9.s[2] \n"
792 "fmla v31.4s, v13.4s, v9.s[3] \n"
793
794 "fmla v16.4s, v14.4s, v10.s[0] \n"
795 "fmla v18.4s, v14.4s, v10.s[1] \n"
796 "fmla v20.4s, v14.4s, v10.s[2] \n"
797 "fmla v22.4s, v14.4s, v10.s[3] \n"
798 "fmla v24.4s, v14.4s, v11.s[0] \n"
799 "fmla v26.4s, v14.4s, v11.s[1] \n"
800 "fmla v28.4s, v14.4s, v11.s[2] \n"
801 "fmla v30.4s, v14.4s, v11.s[3] \n"
802 "fmla v17.4s, v15.4s, v10.s[0] \n"
803 "fmla v19.4s, v15.4s, v10.s[1] \n"
804 "fmla v21.4s, v15.4s, v10.s[2] \n"
805 "fmla v23.4s, v15.4s, v10.s[3] \n"
806 "fmla v25.4s, v15.4s, v11.s[0] \n"
807 "fmla v27.4s, v15.4s, v11.s[1] \n"
808 "fmla v29.4s, v15.4s, v11.s[2] \n"
809 "fmla v31.4s, v15.4s, v11.s[3] \n"
810
811 "bne 0b \n"
812
813 "st1 {v16.4s, v17.4s}, [%1], #32 \n"
814 "st1 {v18.4s, v19.4s}, [%2], #32 \n"
815 "st1 {v20.4s, v21.4s}, [%3], #32 \n"
816 "st1 {v22.4s, v23.4s}, [%4], #32 \n"
817 "st1 {v24.4s, v25.4s}, [%5], #32 \n"
818 "st1 {v26.4s, v27.4s}, [%6], #32 \n"
819 "st1 {v28.4s, v29.4s}, [%7], #32 \n"
820 "st1 {v30.4s, v31.4s}, [%8], #32 \n"
821
822 : "=r"(nn), // %0
823 "=r"(output0_tm), // %1
824 "=r"(output1_tm), // %2
825 "=r"(output2_tm), // %3
826 "=r"(output3_tm), // %4
827 "=r"(output4_tm), // %5
828 "=r"(output5_tm), // %6
829 "=r"(output6_tm), // %7
830 "=r"(output7_tm), // %8
831 "=r"(r0), // %9
832 "=r"(kptr) // %10
833 : "0"(nn),
834 "1"(output0_tm),
835 "2"(output1_tm),
836 "3"(output2_tm),
837 "4"(output3_tm),
838 "5"(output4_tm),
839 "6"(output5_tm),
840 "7"(output6_tm),
841 "8"(output7_tm),
842 "9"(r0),
843 "10"(kptr)
844 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
845 }
846 for (; i + 3 < tiles; i += 4)
847 {
848 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
849
850 const float* kptr = kernel01_tm.row(r);
851
852 int nn = inch; // inch always > 0
853
854 asm volatile(
855 "eor v16.16b, v16.16b, v16.16b \n"
856 "eor v17.16b, v17.16b, v17.16b \n"
857 "eor v18.16b, v18.16b, v18.16b \n"
858 "eor v19.16b, v19.16b, v19.16b \n"
859 "eor v20.16b, v20.16b, v20.16b \n"
860 "eor v21.16b, v21.16b, v21.16b \n"
861 "eor v22.16b, v22.16b, v22.16b \n"
862 "eor v23.16b, v23.16b, v23.16b \n"
863
864 "0: \n"
865
866 "prfm pldl1keep, [%9, #512] \n"
867 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%9], #64 \n"
868
869 "prfm pldl1keep, [%10, #512] \n"
870 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
871
872 "subs %w0, %w0, #1 \n"
873
874 "fmla v16.4s, v0.4s, v4.s[0] \n"
875 "fmla v17.4s, v0.4s, v4.s[1] \n"
876 "fmla v18.4s, v0.4s, v4.s[2] \n"
877 "fmla v19.4s, v0.4s, v4.s[3] \n"
878 "fmla v20.4s, v0.4s, v5.s[0] \n"
879 "fmla v21.4s, v0.4s, v5.s[1] \n"
880 "fmla v22.4s, v0.4s, v5.s[2] \n"
881 "fmla v23.4s, v0.4s, v5.s[3] \n"
882
883 "prfm pldl1keep, [%10, #512] \n"
884 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
885
886 "fmla v16.4s, v1.4s, v6.s[0] \n"
887 "fmla v17.4s, v1.4s, v6.s[1] \n"
888 "fmla v18.4s, v1.4s, v6.s[2] \n"
889 "fmla v19.4s, v1.4s, v6.s[3] \n"
890 "fmla v20.4s, v1.4s, v7.s[0] \n"
891 "fmla v21.4s, v1.4s, v7.s[1] \n"
892 "fmla v22.4s, v1.4s, v7.s[2] \n"
893 "fmla v23.4s, v1.4s, v7.s[3] \n"
894
895 "fmla v16.4s, v2.4s, v8.s[0] \n"
896 "fmla v17.4s, v2.4s, v8.s[1] \n"
897 "fmla v18.4s, v2.4s, v8.s[2] \n"
898 "fmla v19.4s, v2.4s, v8.s[3] \n"
899 "fmla v20.4s, v2.4s, v9.s[0] \n"
900 "fmla v21.4s, v2.4s, v9.s[1] \n"
901 "fmla v22.4s, v2.4s, v9.s[2] \n"
902 "fmla v23.4s, v2.4s, v9.s[3] \n"
903
904 "fmla v16.4s, v3.4s, v10.s[0] \n"
905 "fmla v17.4s, v3.4s, v10.s[1] \n"
906 "fmla v18.4s, v3.4s, v10.s[2] \n"
907 "fmla v19.4s, v3.4s, v10.s[3] \n"
908 "fmla v20.4s, v3.4s, v11.s[0] \n"
909 "fmla v21.4s, v3.4s, v11.s[1] \n"
910 "fmla v22.4s, v3.4s, v11.s[2] \n"
911 "fmla v23.4s, v3.4s, v11.s[3] \n"
912
913 "bne 0b \n"
914
915 "st1 {v16.4s}, [%1], #16 \n"
916 "st1 {v17.4s}, [%2], #16 \n"
917 "st1 {v18.4s}, [%3], #16 \n"
918 "st1 {v19.4s}, [%4], #16 \n"
919 "st1 {v20.4s}, [%5], #16 \n"
920 "st1 {v21.4s}, [%6], #16 \n"
921 "st1 {v22.4s}, [%7], #16 \n"
922 "st1 {v23.4s}, [%8], #16 \n"
923
924 : "=r"(nn), // %0
925 "=r"(output0_tm), // %1
926 "=r"(output1_tm), // %2
927 "=r"(output2_tm), // %3
928 "=r"(output3_tm), // %4
929 "=r"(output4_tm), // %5
930 "=r"(output5_tm), // %6
931 "=r"(output6_tm), // %7
932 "=r"(output7_tm), // %8
933 "=r"(r0), // %9
934 "=r"(kptr) // %10
935 : "0"(nn),
936 "1"(output0_tm),
937 "2"(output1_tm),
938 "3"(output2_tm),
939 "4"(output3_tm),
940 "5"(output4_tm),
941 "6"(output5_tm),
942 "7"(output6_tm),
943 "8"(output7_tm),
944 "9"(r0),
945 "10"(kptr)
946 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
947 }
948 for (; i < tiles; i++)
949 {
950 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
951
952 const float* kptr = kernel01_tm.row(r);
953
954 int nn = inch; // inch always > 0
955
956 asm volatile(
957 "eor v16.16b, v16.16b, v16.16b \n"
958 "eor v17.16b, v17.16b, v17.16b \n"
959 "eor v18.16b, v18.16b, v18.16b \n"
960 "eor v19.16b, v19.16b, v19.16b \n"
961
962 "0: \n"
963
964 "prfm pldl1keep, [%9, #128] \n"
965 "ld1 {v0.4s}, [%9], #16 \n"
966
967 "prfm pldl1keep, [%10, #512] \n"
968 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%10], #64 \n"
969
970 "subs %w0, %w0, #1 \n"
971
972 "fmla v16.4s, v4.4s, v0.s[0] \n"
973 "fmla v17.4s, v5.4s, v0.s[0] \n"
974 "fmla v18.4s, v6.4s, v0.s[1] \n"
975 "fmla v19.4s, v7.4s, v0.s[1] \n"
976
977 "prfm pldl1keep, [%10, #512] \n"
978 "ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%10], #64 \n"
979
980 "fmla v16.4s, v8.4s, v0.s[2] \n"
981 "fmla v17.4s, v9.4s, v0.s[2] \n"
982 "fmla v18.4s, v10.4s, v0.s[3] \n"
983 "fmla v19.4s, v11.4s, v0.s[3] \n"
984
985 "bne 0b \n"
986
987 "fadd v16.4s, v16.4s, v18.4s \n"
988 "fadd v17.4s, v17.4s, v19.4s \n"
989
990 "st1 {v16.s}[0], [%1], #4 \n"
991 "st1 {v16.s}[1], [%2], #4 \n"
992 "st1 {v16.s}[2], [%3], #4 \n"
993 "st1 {v16.s}[3], [%4], #4 \n"
994 "st1 {v17.s}[0], [%5], #4 \n"
995 "st1 {v17.s}[1], [%6], #4 \n"
996 "st1 {v17.s}[2], [%7], #4 \n"
997 "st1 {v17.s}[3], [%8], #4 \n"
998
999 : "=r"(nn), // %0
1000 "=r"(output0_tm), // %1
1001 "=r"(output1_tm), // %2
1002 "=r"(output2_tm), // %3
1003 "=r"(output3_tm), // %4
1004 "=r"(output4_tm), // %5
1005 "=r"(output5_tm), // %6
1006 "=r"(output6_tm), // %7
1007 "=r"(output7_tm), // %8
1008 "=r"(r0), // %9
1009 "=r"(kptr) // %10
1010 : "0"(nn),
1011 "1"(output0_tm),
1012 "2"(output1_tm),
1013 "3"(output2_tm),
1014 "4"(output3_tm),
1015 "5"(output4_tm),
1016 "6"(output5_tm),
1017 "7"(output6_tm),
1018 "8"(output7_tm),
1019 "9"(r0),
1020 "10"(kptr)
1021 : "cc", "memory", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v16", "v17", "v18", "v19");
1022 }
1023 }
1024 }
1025
1026 remain_outch_start += nn_outch << 3;
1027 nn_outch = (outch - remain_outch_start) >> 2;
1028 #else // __aarch64__
1029 nn_outch = outch >> 2;
1030 #endif // __aarch64__
1031
1032 #pragma omp parallel for num_threads(opt.num_threads)
1033 for (int pp = 0; pp < nn_outch; pp++)
1034 {
1035 int p = remain_outch_start + pp * 4;
1036
1037 float* output0_tm = top_blob_tm.channel(p);
1038 float* output1_tm = top_blob_tm.channel(p + 1);
1039 float* output2_tm = top_blob_tm.channel(p + 2);
1040 float* output3_tm = top_blob_tm.channel(p + 3);
1041
1042 #if __aarch64__
1043 const Mat kernel01_tm = kernel_tm.channel(p / 8 + (p % 8) / 4);
1044 #else
1045 const Mat kernel01_tm = kernel_tm.channel(p / 4);
1046 #endif
1047
1048 for (int r = 0; r < 64; r++)
1049 {
1050 const Mat bb2 = bottom_blob_tm2.channel(r);
1051
1052 int i = 0;
1053 #if __aarch64__
1054 for (; i + 11 < tiles; i += 12)
1055 {
1056 const float* r0 = bb2.row(i / 12);
1057
1058 const float* kptr = kernel01_tm.row(r);
1059
1060 int nn = inch; // inch always > 0
1061
1062 asm volatile(
1063 "eor v8.16b, v8.16b, v8.16b \n"
1064 "eor v9.16b, v9.16b, v9.16b \n"
1065 "eor v10.16b, v10.16b, v10.16b \n"
1066 "eor v11.16b, v11.16b, v11.16b \n"
1067 "eor v12.16b, v12.16b, v12.16b \n"
1068 "eor v13.16b, v13.16b, v13.16b \n"
1069 "eor v14.16b, v14.16b, v14.16b \n"
1070 "eor v15.16b, v15.16b, v15.16b \n"
1071 "eor v16.16b, v16.16b, v16.16b \n"
1072 "eor v17.16b, v17.16b, v17.16b \n"
1073 "eor v18.16b, v18.16b, v18.16b \n"
1074 "eor v19.16b, v19.16b, v19.16b \n"
1075
1076 "0: \n"
1077
1078 "prfm pldl1keep, [%5, #512] \n"
1079 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1080
1081 "prfm pldl1keep, [%6, #512] \n"
1082 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1083
1084 "subs %w0, %w0, #1 \n"
1085
1086 "fmla v8.4s, v0.4s, v4.s[0] \n"
1087 "fmla v11.4s, v0.4s, v4.s[1] \n"
1088 "fmla v14.4s, v0.4s, v4.s[2] \n"
1089 "fmla v17.4s, v0.4s, v4.s[3] \n"
1090 "fmla v9.4s, v1.4s, v4.s[0] \n"
1091 "fmla v12.4s, v1.4s, v4.s[1] \n"
1092 "fmla v15.4s, v1.4s, v4.s[2] \n"
1093 "fmla v18.4s, v1.4s, v4.s[3] \n"
1094 "fmla v10.4s, v2.4s, v4.s[0] \n"
1095 "fmla v13.4s, v2.4s, v4.s[1] \n"
1096 "fmla v16.4s, v2.4s, v4.s[2] \n"
1097 "fmla v19.4s, v2.4s, v4.s[3] \n"
1098
1099 "prfm pldl1keep, [%5, #512] \n"
1100 "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%5], #64 \n"
1101
1102 "fmla v8.4s, v3.4s, v5.s[0] \n"
1103 "fmla v11.4s, v3.4s, v5.s[1] \n"
1104 "fmla v14.4s, v3.4s, v5.s[2] \n"
1105 "fmla v17.4s, v3.4s, v5.s[3] \n"
1106 "fmla v9.4s, v20.4s, v5.s[0] \n"
1107 "fmla v12.4s, v20.4s, v5.s[1] \n"
1108 "fmla v15.4s, v20.4s, v5.s[2] \n"
1109 "fmla v18.4s, v20.4s, v5.s[3] \n"
1110 "fmla v10.4s, v21.4s, v5.s[0] \n"
1111 "fmla v13.4s, v21.4s, v5.s[1] \n"
1112 "fmla v16.4s, v21.4s, v5.s[2] \n"
1113 "fmla v19.4s, v21.4s, v5.s[3] \n"
1114
1115 "prfm pldl1keep, [%5, #512] \n"
1116 "ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [%5], #64 \n"
1117
1118 "fmla v8.4s, v22.4s, v6.s[0] \n"
1119 "fmla v11.4s, v22.4s, v6.s[1] \n"
1120 "fmla v14.4s, v22.4s, v6.s[2] \n"
1121 "fmla v17.4s, v22.4s, v6.s[3] \n"
1122 "fmla v9.4s, v23.4s, v6.s[0] \n"
1123 "fmla v12.4s, v23.4s, v6.s[1] \n"
1124 "fmla v15.4s, v23.4s, v6.s[2] \n"
1125 "fmla v18.4s, v23.4s, v6.s[3] \n"
1126 "fmla v10.4s, v24.4s, v6.s[0] \n"
1127 "fmla v13.4s, v24.4s, v6.s[1] \n"
1128 "fmla v16.4s, v24.4s, v6.s[2] \n"
1129 "fmla v19.4s, v24.4s, v6.s[3] \n"
1130
1131 "fmla v8.4s, v25.4s, v7.s[0] \n"
1132 "fmla v11.4s, v25.4s, v7.s[1] \n"
1133 "fmla v14.4s, v25.4s, v7.s[2] \n"
1134 "fmla v17.4s, v25.4s, v7.s[3] \n"
1135 "fmla v9.4s, v26.4s, v7.s[0] \n"
1136 "fmla v12.4s, v26.4s, v7.s[1] \n"
1137 "fmla v15.4s, v26.4s, v7.s[2] \n"
1138 "fmla v18.4s, v26.4s, v7.s[3] \n"
1139 "fmla v10.4s, v27.4s, v7.s[0] \n"
1140 "fmla v13.4s, v27.4s, v7.s[1] \n"
1141 "fmla v16.4s, v27.4s, v7.s[2] \n"
1142 "fmla v19.4s, v27.4s, v7.s[3] \n"
1143
1144 "bne 0b \n"
1145
1146 "st1 {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
1147 "st1 {v11.4s, v12.4s, v13.4s}, [%2], #48 \n"
1148 "st1 {v14.4s, v15.4s, v16.4s}, [%3], #48 \n"
1149 "st1 {v17.4s, v18.4s, v19.4s}, [%4], #48 \n"
1150
1151 : "=r"(nn), // %0
1152 "=r"(output0_tm), // %1
1153 "=r"(output1_tm), // %2
1154 "=r"(output2_tm), // %3
1155 "=r"(output3_tm), // %4
1156 "=r"(r0), // %5
1157 "=r"(kptr) // %6
1158 : "0"(nn),
1159 "1"(output0_tm),
1160 "2"(output1_tm),
1161 "3"(output2_tm),
1162 "4"(output3_tm),
1163 "5"(r0),
1164 "6"(kptr)
1165 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27");
1166 }
1167 #endif // __aarch64__
1168 for (; i + 7 < tiles; i += 8)
1169 {
1170 #if __aarch64__
1171 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1172 #else
1173 const float* r0 = bb2.row(i / 8);
1174 #endif
1175
1176 const float* kptr = kernel01_tm.row(r);
1177
1178 int nn = inch; // inch always > 0
1179
1180 #if __aarch64__
1181 asm volatile(
1182 "eor v8.16b, v8.16b, v8.16b \n"
1183 "eor v9.16b, v9.16b, v9.16b \n"
1184 "eor v10.16b, v10.16b, v10.16b \n"
1185 "eor v11.16b, v11.16b, v11.16b \n"
1186 "eor v12.16b, v12.16b, v12.16b \n"
1187 "eor v13.16b, v13.16b, v13.16b \n"
1188 "eor v14.16b, v14.16b, v14.16b \n"
1189 "eor v15.16b, v15.16b, v15.16b \n"
1190
1191 "0: \n"
1192
1193 "prfm pldl1keep, [%5, #512] \n"
1194 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1195
1196 "prfm pldl1keep, [%6, #512] \n"
1197 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1198
1199 "subs %w0, %w0, #1 \n"
1200
1201 "fmla v8.4s, v0.4s, v4.s[0] \n"
1202 "fmla v10.4s, v0.4s, v4.s[1] \n"
1203 "fmla v12.4s, v0.4s, v4.s[2] \n"
1204 "fmla v14.4s, v0.4s, v4.s[3] \n"
1205 "fmla v9.4s, v1.4s, v4.s[0] \n"
1206 "fmla v11.4s, v1.4s, v4.s[1] \n"
1207 "fmla v13.4s, v1.4s, v4.s[2] \n"
1208 "fmla v15.4s, v1.4s, v4.s[3] \n"
1209
1210 "fmla v8.4s, v2.4s, v5.s[0] \n"
1211 "fmla v10.4s, v2.4s, v5.s[1] \n"
1212 "fmla v12.4s, v2.4s, v5.s[2] \n"
1213 "fmla v14.4s, v2.4s, v5.s[3] \n"
1214 "fmla v9.4s, v3.4s, v5.s[0] \n"
1215 "fmla v11.4s, v3.4s, v5.s[1] \n"
1216 "fmla v13.4s, v3.4s, v5.s[2] \n"
1217 "fmla v15.4s, v3.4s, v5.s[3] \n"
1218
1219 "prfm pldl1keep, [%5, #512] \n"
1220 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%5], #64 \n"
1221
1222 "fmla v8.4s, v16.4s, v6.s[0] \n"
1223 "fmla v10.4s, v16.4s, v6.s[1] \n"
1224 "fmla v12.4s, v16.4s, v6.s[2] \n"
1225 "fmla v14.4s, v16.4s, v6.s[3] \n"
1226 "fmla v9.4s, v17.4s, v6.s[0] \n"
1227 "fmla v11.4s, v17.4s, v6.s[1] \n"
1228 "fmla v13.4s, v17.4s, v6.s[2] \n"
1229 "fmla v15.4s, v17.4s, v6.s[3] \n"
1230
1231 "fmla v8.4s, v18.4s, v7.s[0] \n"
1232 "fmla v10.4s, v18.4s, v7.s[1] \n"
1233 "fmla v12.4s, v18.4s, v7.s[2] \n"
1234 "fmla v14.4s, v18.4s, v7.s[3] \n"
1235 "fmla v9.4s, v19.4s, v7.s[0] \n"
1236 "fmla v11.4s, v19.4s, v7.s[1] \n"
1237 "fmla v13.4s, v19.4s, v7.s[2] \n"
1238 "fmla v15.4s, v19.4s, v7.s[3] \n"
1239
1240 "bne 0b \n"
1241
1242 "st1 {v8.4s, v9.4s}, [%1], #32 \n"
1243 "st1 {v10.4s, v11.4s}, [%2], #32 \n"
1244 "st1 {v12.4s, v13.4s}, [%3], #32 \n"
1245 "st1 {v14.4s, v15.4s}, [%4], #32 \n"
1246
1247 : "=r"(nn), // %0
1248 "=r"(output0_tm), // %1
1249 "=r"(output1_tm), // %2
1250 "=r"(output2_tm), // %3
1251 "=r"(output3_tm), // %4
1252 "=r"(r0), // %5
1253 "=r"(kptr) // %6
1254 : "0"(nn),
1255 "1"(output0_tm),
1256 "2"(output1_tm),
1257 "3"(output2_tm),
1258 "4"(output3_tm),
1259 "5"(r0),
1260 "6"(kptr)
1261 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1262 #else // __aarch64__
1263 asm volatile(
1264 "veor q8, q8 \n"
1265 "veor q9, q9 \n"
1266 "veor q10, q10 \n"
1267 "veor q11, q11 \n"
1268 "veor q12, q12 \n"
1269 "veor q13, q13 \n"
1270 "veor q14, q14 \n"
1271 "veor q15, q15 \n"
1272
1273 "0: \n"
1274
1275 "pld [%5, #512] \n"
1276 "vldm %5!, {d0-d7} \n"
1277
1278 "pld [%6, #512] \n"
1279 "vldm %6!, {d8-d15} \n"
1280
1281 "vmla.f32 q8, q0, d8[0] \n"
1282 "vmla.f32 q10, q0, d8[1] \n"
1283 "vmla.f32 q12, q0, d9[0] \n"
1284 "vmla.f32 q14, q0, d9[1] \n"
1285 "vmla.f32 q9, q1, d8[0] \n"
1286 "vmla.f32 q11, q1, d8[1] \n"
1287 "vmla.f32 q13, q1, d9[0] \n"
1288 "vmla.f32 q15, q1, d9[1] \n"
1289
1290 "vmla.f32 q8, q2, d10[0] \n"
1291 "vmla.f32 q10, q2, d10[1] \n"
1292 "vmla.f32 q12, q2, d11[0] \n"
1293 "vmla.f32 q14, q2, d11[1] \n"
1294 "vmla.f32 q9, q3, d10[0] \n"
1295 "vmla.f32 q11, q3, d10[1] \n"
1296 "vmla.f32 q13, q3, d11[0] \n"
1297 "vmla.f32 q15, q3, d11[1] \n"
1298
1299 "pld [%5, #512] \n"
1300 "vldm %5!, {d0-d7} \n"
1301
1302 "vmla.f32 q8, q0, d12[0] \n"
1303 "vmla.f32 q10, q0, d12[1] \n"
1304 "vmla.f32 q12, q0, d13[0] \n"
1305 "vmla.f32 q14, q0, d13[1] \n"
1306 "vmla.f32 q9, q1, d12[0] \n"
1307 "vmla.f32 q11, q1, d12[1] \n"
1308 "vmla.f32 q13, q1, d13[0] \n"
1309 "vmla.f32 q15, q1, d13[1] \n"
1310
1311 "subs %0, %0, #1 \n"
1312
1313 "vmla.f32 q8, q2, d14[0] \n"
1314 "vmla.f32 q10, q2, d14[1] \n"
1315 "vmla.f32 q12, q2, d15[0] \n"
1316 "vmla.f32 q14, q2, d15[1] \n"
1317 "vmla.f32 q9, q3, d14[0] \n"
1318 "vmla.f32 q11, q3, d14[1] \n"
1319 "vmla.f32 q13, q3, d15[0] \n"
1320 "vmla.f32 q15, q3, d15[1] \n"
1321
1322 "bne 0b \n"
1323
1324 "vst1.f32 {d16-d19}, [%1]! \n"
1325 "vst1.f32 {d20-d23}, [%2]! \n"
1326 "vst1.f32 {d24-d27}, [%3]! \n"
1327 "vst1.f32 {d28-d31}, [%4]! \n"
1328
1329 : "=r"(nn), // %0
1330 "=r"(output0_tm), // %1
1331 "=r"(output1_tm), // %2
1332 "=r"(output2_tm), // %3
1333 "=r"(output3_tm), // %4
1334 "=r"(r0), // %5
1335 "=r"(kptr) // %6
1336 : "0"(nn),
1337 "1"(output0_tm),
1338 "2"(output1_tm),
1339 "3"(output2_tm),
1340 "4"(output3_tm),
1341 "5"(r0),
1342 "6"(kptr)
1343 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1344 #endif // __aarch64__
1345 }
1346 for (; i + 3 < tiles; i += 4)
1347 {
1348 #if __aarch64__
1349 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1350 #else
1351 const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1352 #endif
1353
1354 const float* kptr = kernel01_tm.row(r);
1355
1356 int nn = inch; // inch always > 0
1357
1358 #if __aarch64__
1359 asm volatile(
1360 "eor v8.16b, v8.16b, v8.16b \n"
1361 "eor v9.16b, v9.16b, v9.16b \n"
1362 "eor v10.16b, v10.16b, v10.16b \n"
1363 "eor v11.16b, v11.16b, v11.16b \n"
1364
1365 "0: \n"
1366
1367 "prfm pldl1keep, [%5, #512] \n"
1368 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%5], #64 \n"
1369
1370 "prfm pldl1keep, [%6, #512] \n"
1371 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1372
1373 "subs %w0, %w0, #1 \n"
1374
1375 "fmla v8.4s, v0.4s, v4.s[0] \n"
1376 "fmla v9.4s, v0.4s, v4.s[1] \n"
1377 "fmla v10.4s, v0.4s, v4.s[2] \n"
1378 "fmla v11.4s, v0.4s, v4.s[3] \n"
1379
1380 "fmla v8.4s, v1.4s, v5.s[0] \n"
1381 "fmla v9.4s, v1.4s, v5.s[1] \n"
1382 "fmla v10.4s, v1.4s, v5.s[2] \n"
1383 "fmla v11.4s, v1.4s, v5.s[3] \n"
1384
1385 "fmla v8.4s, v2.4s, v6.s[0] \n"
1386 "fmla v9.4s, v2.4s, v6.s[1] \n"
1387 "fmla v10.4s, v2.4s, v6.s[2] \n"
1388 "fmla v11.4s, v2.4s, v6.s[3] \n"
1389
1390 "fmla v8.4s, v3.4s, v7.s[0] \n"
1391 "fmla v9.4s, v3.4s, v7.s[1] \n"
1392 "fmla v10.4s, v3.4s, v7.s[2] \n"
1393 "fmla v11.4s, v3.4s, v7.s[3] \n"
1394
1395 "bne 0b \n"
1396
1397 "st1 {v8.4s}, [%1], #16 \n"
1398 "st1 {v9.4s}, [%2], #16 \n"
1399 "st1 {v10.4s}, [%3], #16 \n"
1400 "st1 {v11.4s}, [%4], #16 \n"
1401
1402 : "=r"(nn), // %0
1403 "=r"(output0_tm), // %1
1404 "=r"(output1_tm), // %2
1405 "=r"(output2_tm), // %3
1406 "=r"(output3_tm), // %4
1407 "=r"(r0), // %5
1408 "=r"(kptr) // %6
1409 : "0"(nn),
1410 "1"(output0_tm),
1411 "2"(output1_tm),
1412 "3"(output2_tm),
1413 "4"(output3_tm),
1414 "5"(r0),
1415 "6"(kptr)
1416 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
1417 #else // __aarch64__
1418 asm volatile(
1419 "veor q8, q8 \n"
1420 "veor q9, q9 \n"
1421 "veor q10, q10 \n"
1422 "veor q11, q11 \n"
1423
1424 "0: \n"
1425
1426 "pld [%5, #512] \n"
1427 "vldm %5!, {d0-d7} \n"
1428
1429 "pld [%6, #512] \n"
1430 "vldm %6!, {d8-d15} \n"
1431
1432 "vmla.f32 q8, q0, d8[0] \n"
1433 "vmla.f32 q9, q0, d8[1] \n"
1434 "vmla.f32 q10, q0, d9[0] \n"
1435 "vmla.f32 q11, q0, d9[1] \n"
1436
1437 "vmla.f32 q8, q1, d10[0] \n"
1438 "vmla.f32 q9, q1, d10[1] \n"
1439 "vmla.f32 q10, q1, d11[0] \n"
1440 "vmla.f32 q11, q1, d11[1] \n"
1441
1442 "subs %0, %0, #1 \n"
1443
1444 "vmla.f32 q8, q2, d12[0] \n"
1445 "vmla.f32 q9, q2, d12[1] \n"
1446 "vmla.f32 q10, q2, d13[0] \n"
1447 "vmla.f32 q11, q2, d13[1] \n"
1448
1449 "vmla.f32 q8, q3, d14[0] \n"
1450 "vmla.f32 q9, q3, d14[1] \n"
1451 "vmla.f32 q10, q3, d15[0] \n"
1452 "vmla.f32 q11, q3, d15[1] \n"
1453
1454 "bne 0b \n"
1455
1456 "vst1.f32 {d16-d17}, [%1]! \n"
1457 "vst1.f32 {d18-d19}, [%2]! \n"
1458 "vst1.f32 {d20-d21}, [%3]! \n"
1459 "vst1.f32 {d22-d23}, [%4]! \n"
1460
1461 : "=r"(nn), // %0
1462 "=r"(output0_tm), // %1
1463 "=r"(output1_tm), // %2
1464 "=r"(output2_tm), // %3
1465 "=r"(output3_tm), // %4
1466 "=r"(r0), // %5
1467 "=r"(kptr) // %6
1468 : "0"(nn),
1469 "1"(output0_tm),
1470 "2"(output1_tm),
1471 "3"(output2_tm),
1472 "4"(output3_tm),
1473 "5"(r0),
1474 "6"(kptr)
1475 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1476 #endif // __aarch64__
1477 }
1478 for (; i < tiles; i++)
1479 {
1480 #if __aarch64__
1481 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
1482 #else
1483 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
1484 #endif
1485
1486 const float* kptr = kernel01_tm.row(r);
1487
1488 int nn = inch; // inch always > 0
1489
1490 #if __aarch64__
1491 asm volatile(
1492 "eor v8.16b, v8.16b, v8.16b \n"
1493 "eor v9.16b, v9.16b, v9.16b \n"
1494 "eor v10.16b, v10.16b, v10.16b \n"
1495 "eor v11.16b, v11.16b, v11.16b \n"
1496
1497 "0: \n"
1498
1499 "prfm pldl1keep, [%5, #128] \n"
1500 "ld1 {v0.4s}, [%5], #16 \n"
1501
1502 "prfm pldl1keep, [%6, #512] \n"
1503 "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%6], #64 \n"
1504
1505 "subs %w0, %w0, #1 \n"
1506
1507 "fmla v8.4s, v4.4s, v0.s[0] \n"
1508 "fmla v9.4s, v5.4s, v0.s[1] \n"
1509 "fmla v10.4s, v6.4s, v0.s[2] \n"
1510 "fmla v11.4s, v7.4s, v0.s[3] \n"
1511
1512 "bne 0b \n"
1513
1514 "fadd v8.4s, v8.4s, v9.4s \n"
1515 "fadd v10.4s, v10.4s, v11.4s \n"
1516 "fadd v8.4s, v8.4s, v10.4s \n"
1517
1518 "st1 {v8.s}[0], [%1], #4 \n"
1519 "st1 {v8.s}[1], [%2], #4 \n"
1520 "st1 {v8.s}[2], [%3], #4 \n"
1521 "st1 {v8.s}[3], [%4], #4 \n"
1522
1523 : "=r"(nn), // %0
1524 "=r"(output0_tm), // %1
1525 "=r"(output1_tm), // %2
1526 "=r"(output2_tm), // %3
1527 "=r"(output3_tm), // %4
1528 "=r"(r0), // %5
1529 "=r"(kptr) // %6
1530 : "0"(nn),
1531 "1"(output0_tm),
1532 "2"(output1_tm),
1533 "3"(output2_tm),
1534 "4"(output3_tm),
1535 "5"(r0),
1536 "6"(kptr)
1537 : "cc", "memory", "v0", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
1538 #else // __aarch64__
1539 asm volatile(
1540 "veor q8, q8 \n"
1541 "veor q9, q9 \n"
1542 "veor q10, q10 \n"
1543 "veor q11, q11 \n"
1544
1545 "0: \n"
1546
1547 "pld [%5, #128] \n"
1548 "vld1.f32 {d0-d1}, [%5]! \n"
1549
1550 "pld [%6, #512] \n"
1551 "vldm %6!, {d8-d15} \n"
1552
1553 "subs %0, %0, #1 \n"
1554
1555 "vmla.f32 q8, q4, d0[0] \n"
1556 "vmla.f32 q9, q5, d0[1] \n"
1557 "vmla.f32 q10, q6, d1[0] \n"
1558 "vmla.f32 q11, q7, d1[1] \n"
1559
1560 "bne 0b \n"
1561
1562 "vadd.f32 q8, q8, q9 \n"
1563 "vadd.f32 q10, q10, q11 \n"
1564 "vadd.f32 q8, q8, q10 \n"
1565
1566 "vst1.f32 {d16[0]}, [%1]! \n"
1567 "vst1.f32 {d16[1]}, [%2]! \n"
1568 "vst1.f32 {d17[0]}, [%3]! \n"
1569 "vst1.f32 {d17[1]}, [%4]! \n"
1570
1571 : "=r"(nn), // %0
1572 "=r"(output0_tm), // %1
1573 "=r"(output1_tm), // %2
1574 "=r"(output2_tm), // %3
1575 "=r"(output3_tm), // %4
1576 "=r"(r0), // %5
1577 "=r"(kptr) // %6
1578 : "0"(nn),
1579 "1"(output0_tm),
1580 "2"(output1_tm),
1581 "3"(output2_tm),
1582 "4"(output3_tm),
1583 "5"(r0),
1584 "6"(kptr)
1585 : "cc", "memory", "q0", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11");
1586 #endif // __aarch64__
1587 }
1588 }
1589 }
1590
1591 remain_outch_start += nn_outch << 2;
1592
1593 #pragma omp parallel for num_threads(opt.num_threads)
1594 for (int p = remain_outch_start; p < outch; p++)
1595 {
1596 float* output0_tm = top_blob_tm.channel(p);
1597
1598 #if __aarch64__
1599 const Mat kernel0_tm = kernel_tm.channel(p / 8 + (p % 8) / 4 + p % 4);
1600 #else
1601 const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4);
1602 #endif
1603
1604 for (int r = 0; r < 64; r++)
1605 {
1606 const Mat bb2 = bottom_blob_tm2.channel(r);
1607
1608 int i = 0;
1609 #if __aarch64__
1610 for (; i + 11 < tiles; i += 12)
1611 {
1612 const float* r0 = bb2.row(i / 12);
1613
1614 const float* kptr = kernel0_tm.row(r);
1615
1616 int nn = inch; // inch always > 0
1617
1618 asm volatile(
1619 "eor v8.16b, v8.16b, v8.16b \n"
1620 "eor v9.16b, v9.16b, v9.16b \n"
1621 "eor v10.16b, v10.16b, v10.16b \n"
1622 "eor v5.16b, v5.16b, v5.16b \n"
1623 "eor v6.16b, v6.16b, v6.16b \n"
1624 "eor v7.16b, v7.16b, v7.16b \n"
1625
1626 "0: \n"
1627
1628 "prfm pldl1keep, [%2, #512] \n"
1629 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1630
1631 "prfm pldl1keep, [%3, #128] \n"
1632 "ld1 {v4.4s}, [%3], #16 \n"
1633
1634 "subs %w0, %w0, #1 \n"
1635
1636 "fmla v8.4s, v0.4s, v4.s[0] \n"
1637 "fmla v9.4s, v1.4s, v4.s[0] \n"
1638 "fmla v10.4s, v2.4s, v4.s[0] \n"
1639
1640 "prfm pldl1keep, [%2, #512] \n"
1641 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n"
1642
1643 "fmla v5.4s, v3.4s, v4.s[1] \n"
1644 "fmla v6.4s, v12.4s, v4.s[1] \n"
1645 "fmla v7.4s, v13.4s, v4.s[1] \n"
1646
1647 "prfm pldl1keep, [%2, #512] \n"
1648 "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%2], #64 \n"
1649
1650 "fmla v8.4s, v14.4s, v4.s[2] \n"
1651 "fmla v9.4s, v15.4s, v4.s[2] \n"
1652 "fmla v10.4s, v16.4s, v4.s[2] \n"
1653
1654 "fmla v5.4s, v17.4s, v4.s[3] \n"
1655 "fmla v6.4s, v18.4s, v4.s[3] \n"
1656 "fmla v7.4s, v19.4s, v4.s[3] \n"
1657
1658 "bne 0b \n"
1659
1660 "fadd v8.4s, v8.4s, v5.4s \n"
1661 "fadd v9.4s, v9.4s, v6.4s \n"
1662 "fadd v10.4s, v10.4s, v7.4s \n"
1663
1664 "st1 {v8.4s, v9.4s, v10.4s}, [%1], #48 \n"
1665
1666 : "=r"(nn), // %0
1667 "=r"(output0_tm), // %1
1668 "=r"(r0), // %2
1669 "=r"(kptr) // %3
1670 : "0"(nn),
1671 "1"(output0_tm),
1672 "2"(r0),
1673 "3"(kptr)
1674 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19");
1675 }
1676 #endif
1677 for (; i + 7 < tiles; i += 8)
1678 {
1679 #if __aarch64__
1680 const float* r0 = bb2.row(i / 12 + (i % 12) / 8);
1681 #else
1682 const float* r0 = bb2.row(i / 8);
1683 #endif
1684
1685 const float* kptr = kernel0_tm.row(r);
1686
1687 int nn = inch; // inch always > 0
1688
1689 #if __aarch64__
1690 asm volatile(
1691 "eor v8.16b, v8.16b, v8.16b \n"
1692 "eor v9.16b, v9.16b, v9.16b \n"
1693 "eor v10.16b, v10.16b, v10.16b \n"
1694 "eor v11.16b, v11.16b, v11.16b \n"
1695
1696 "0: \n"
1697
1698 "prfm pldl1keep, [%2, #512] \n"
1699 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1700
1701 "prfm pldl1keep, [%3, #128] \n"
1702 "ld1 {v4.4s}, [%3], #16 \n"
1703
1704 "subs %w0, %w0, #1 \n"
1705
1706 "fmla v8.4s, v0.4s, v4.s[0] \n"
1707 "fmla v9.4s, v1.4s, v4.s[0] \n"
1708 "fmla v10.4s, v2.4s, v4.s[1] \n"
1709 "fmla v11.4s, v3.4s, v4.s[1] \n"
1710
1711 "prfm pldl1keep, [%2, #512] \n"
1712 "ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%2], #64 \n"
1713
1714 "fmla v8.4s, v12.4s, v4.s[2] \n"
1715 "fmla v9.4s, v13.4s, v4.s[2] \n"
1716 "fmla v10.4s, v14.4s, v4.s[3] \n"
1717 "fmla v11.4s, v15.4s, v4.s[3] \n"
1718
1719 "bne 0b \n"
1720
1721 "fadd v8.4s, v8.4s, v10.4s \n"
1722 "fadd v9.4s, v9.4s, v11.4s \n"
1723
1724 "st1 {v8.4s, v9.4s}, [%1], #32 \n"
1725
1726 : "=r"(nn), // %0
1727 "=r"(output0_tm), // %1
1728 "=r"(r0), // %2
1729 "=r"(kptr) // %3
1730 : "0"(nn),
1731 "1"(output0_tm),
1732 "2"(r0),
1733 "3"(kptr)
1734 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
1735 #else // __aarch64__
1736 asm volatile(
1737 "veor q8, q8 \n"
1738 "veor q9, q9 \n"
1739 "veor q10, q10 \n"
1740 "veor q11, q11 \n"
1741
1742 "0: \n"
1743
1744 "pld [%2, #512] \n"
1745 "vldm %2!, {d0-d7} \n"
1746
1747 "pld [%3, #128] \n"
1748 "vld1.f32 {d8-d9}, [%3]! \n"
1749
1750 "vmla.f32 q8, q0, d8[0] \n"
1751 "vmla.f32 q9, q1, d8[0] \n"
1752 "vmla.f32 q10, q2, d8[1] \n"
1753 "vmla.f32 q11, q3, d8[1] \n"
1754
1755 "pld [%2, #512] \n"
1756 "vldm %2!, {d24-d31} \n"
1757
1758 "subs %0, %0, #1 \n"
1759
1760 "vmla.f32 q8, q12, d9[0] \n"
1761 "vmla.f32 q9, q13, d9[0] \n"
1762 "vmla.f32 q10, q14, d9[1] \n"
1763 "vmla.f32 q11, q15, d9[1] \n"
1764
1765 "bne 0b \n"
1766
1767 "vadd.f32 q8, q8, q10 \n"
1768 "vadd.f32 q9, q9, q11 \n"
1769
1770 "vst1.f32 {d16-d19}, [%1]! \n"
1771
1772 : "=r"(nn), // %0
1773 "=r"(output0_tm), // %1
1774 "=r"(r0), // %2
1775 "=r"(kptr) // %3
1776 : "0"(nn),
1777 "1"(output0_tm),
1778 "2"(r0),
1779 "3"(kptr)
1780 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
1781 #endif // __aarch64__
1782 }
1783 for (; i + 3 < tiles; i += 4)
1784 {
1785 #if __aarch64__
1786 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4);
1787 #else
1788 const float* r0 = bb2.row(i / 8 + (i % 8) / 4);
1789 #endif
1790
1791 const float* kptr = kernel0_tm.row(r);
1792
1793 int nn = inch; // inch always > 0
1794
1795 #if __aarch64__
1796 asm volatile(
1797 "eor v8.16b, v8.16b, v8.16b \n"
1798 "eor v9.16b, v9.16b, v9.16b \n"
1799 "eor v10.16b, v10.16b, v10.16b \n"
1800 "eor v11.16b, v11.16b, v11.16b \n"
1801
1802 "0: \n"
1803
1804 "prfm pldl1keep, [%2, #512] \n"
1805 "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"
1806
1807 "prfm pldl1keep, [%3, #128] \n"
1808 "ld1 {v4.4s}, [%3], #16 \n"
1809
1810 "subs %w0, %w0, #1 \n"
1811
1812 "fmla v8.4s, v0.4s, v4.s[0] \n"
1813 "fmla v9.4s, v1.4s, v4.s[1] \n"
1814 "fmla v10.4s, v2.4s, v4.s[2] \n"
1815 "fmla v11.4s, v3.4s, v4.s[3] \n"
1816
1817 "bne 0b \n"
1818
1819 "fadd v8.4s, v8.4s, v9.4s \n"
1820 "fadd v10.4s, v10.4s, v11.4s \n"
1821 "fadd v8.4s, v8.4s, v10.4s \n"
1822
1823 "st1 {v8.4s}, [%1], #16 \n"
1824
1825 : "=r"(nn), // %0
1826 "=r"(output0_tm), // %1
1827 "=r"(r0), // %2
1828 "=r"(kptr) // %3
1829 : "0"(nn),
1830 "1"(output0_tm),
1831 "2"(r0),
1832 "3"(kptr)
1833 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11");
1834 #else // __aarch64__
1835 asm volatile(
1836 "veor q8, q8 \n"
1837 "veor q9, q9 \n"
1838 "veor q10, q10 \n"
1839 "veor q11, q11 \n"
1840
1841 "0: \n"
1842
1843 "pld [%2, #512] \n"
1844 "vldm %2!, {d0-d7} \n"
1845
1846 "pld [%3, #128] \n"
1847 "vld1.f32 {d8-d9}, [%3]! \n"
1848
1849 "subs %0, %0, #1 \n"
1850
1851 "vmla.f32 q8, q0, d8[0] \n"
1852 "vmla.f32 q9, q1, d8[1] \n"
1853 "vmla.f32 q10, q2, d9[0] \n"
1854 "vmla.f32 q11, q3, d9[1] \n"
1855
1856 "bne 0b \n"
1857
1858 "vadd.f32 q8, q8, q9 \n"
1859 "vadd.f32 q10, q10, q11 \n"
1860 "vadd.f32 q8, q8, q10 \n"
1861
1862 "vst1.f32 {d16-d17}, [%1]! \n"
1863
1864 : "=r"(nn), // %0
1865 "=r"(output0_tm), // %1
1866 "=r"(r0), // %2
1867 "=r"(kptr) // %3
1868 : "0"(nn),
1869 "1"(output0_tm),
1870 "2"(r0),
1871 "3"(kptr)
1872 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11");
1873 #endif // __aarch64__
1874 }
1875 for (; i < tiles; i++)
1876 {
1877 #if __aarch64__
1878 const float* r0 = bb2.row(i / 12 + (i % 12) / 8 + (i % 12 % 8) / 4 + i % 12 % 4);
1879 #else
1880 const float* r0 = bb2.row(i / 8 + (i % 8) / 4 + i % 4);
1881 #endif
1882
1883 const float* kptr = kernel0_tm.row(r);
1884
1885 float32x4_t _sum0 = vdupq_n_f32(0.f);
1886
1887 for (int q = 0; q < inch; q++)
1888 {
1889 float32x4_t _r0 = vld1q_f32(r0);
1890
1891 float32x4_t _k0 = vld1q_f32(kptr);
1892
1893 _sum0 = vmlaq_f32(_sum0, _r0, _k0);
1894
1895 kptr += 4;
1896 r0 += 4;
1897 }
1898
1899 #if __aarch64__
1900 float sum0 = vaddvq_f32(_sum0);
1901 #else
1902 float32x2_t _ss = vadd_f32(vget_low_f32(_sum0), vget_high_f32(_sum0));
1903 float32x2_t _ss2 = vpadd_f32(_ss, _ss);
1904 float sum0 = vget_lane_f32(_ss2, 0);
1905 #endif
1906
1907 output0_tm[0] = sum0;
1908
1909 output0_tm++;
1910 }
1911 }
1912 }
1913 }
1914 bottom_blob_tm = Mat();
1915 // END dot
1916
1917 // BEGIN transform output
1918 Mat top_blob_bordered;
1919 if (outw == top_blob.w && outh == top_blob.h)
1920 {
1921 top_blob_bordered = top_blob;
1922 }
1923 else
1924 {
1925 top_blob_bordered.create(outw, outh, outch, 2u, 1, opt.workspace_allocator);
1926 }
1927 {
1928 // const float otm[6][8] = {
1929 // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 32.0f, 32.0f, 0.0f},
1930 // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 16.0f,-16.0f, 0.0f},
1931 // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 8.0f, 8.0f, 0.0f},
1932 // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 4.0f, -4.0f, 0.0f},
1933 // {0.0f, 1.0f, 1.0f, 16.0f, 16.0f, 2.0f, 2.0f, 0.0f},
1934 // {0.0f, 1.0f, -1.0f, 32.0f, -32.0f, 1.0f, -1.0f, 1.0f}
1935 // };
1936
1937 // 0 = r0 + (r1 + r2) + (r3 + r4) + (r5 + r6) * 32
1938 // 1 = (r1 - r2) + (r3 - r4) * 2 + (r5 - r6) * 16
1939 // 2 = (r1 + r2) + (r3 + r4) * 4 + (r5 + r6) * 8
1940 // 3 = (r1 - r2) + (r3 - r4) * 8 + (r5 - r6) * 4
1941 // 4 = (r1 + r2) + (r3 + r4) * 16+ (r5 + r6) * 2
1942 // 5 = r7 + (r1 - r2) + (r3 - r4) * 32+ (r5 - r6)
1943
1944 int w_tm = outw / 6 * 8;
1945 int h_tm = outh / 6 * 8;
1946 const int tiles = w_tm / 8 * h_tm / 8;
1947
1948 #pragma omp parallel for num_threads(opt.num_threads)
1949 for (int p = 0; p < outch; p++)
1950 {
1951 const Mat out0_tm = top_blob_tm.channel(p);
1952 Mat out0 = top_blob_bordered.channel(p);
1953
1954 const float bias0 = bias ? bias[p] : 0.f;
1955 // float32x2_t _bias0 = vdup_n_f32(bias0);
1956
1957 float tmp[6][8];
1958
1959 // tile
1960 for (int i = 0; i < outh / 6; i++)
1961 {
1962 for (int j = 0; j < outw / 6; j++)
1963 {
1964 // top_blob_tm.create(tiles, 64, outch, 4u, 1, opt.workspace_allocator);
1965
1966 const float* output0_tm_0 = (const float*)out0_tm + (i * w_tm / 8 + j) * 1;
1967 const float* output0_tm_1 = output0_tm_0 + tiles * 1;
1968 const float* output0_tm_2 = output0_tm_0 + tiles * 2;
1969 const float* output0_tm_3 = output0_tm_0 + tiles * 3;
1970 const float* output0_tm_4 = output0_tm_0 + tiles * 4;
1971 const float* output0_tm_5 = output0_tm_0 + tiles * 5;
1972 const float* output0_tm_6 = output0_tm_0 + tiles * 6;
1973 const float* output0_tm_7 = output0_tm_0 + tiles * 7;
1974
1975 // TODO neon optimize
1976 for (int m = 0; m < 8; m++)
1977 {
1978 float tmp024a = output0_tm_1[0] + output0_tm_2[0];
1979 float tmp135a = output0_tm_1[0] - output0_tm_2[0];
1980
1981 float tmp024b = output0_tm_3[0] + output0_tm_4[0];
1982 float tmp135b = output0_tm_3[0] - output0_tm_4[0];
1983
1984 float tmp024c = output0_tm_5[0] + output0_tm_6[0];
1985 float tmp135c = output0_tm_5[0] - output0_tm_6[0];
1986
1987 tmp[0][m] = output0_tm_0[0] + tmp024a + tmp024b + tmp024c * 32;
1988 tmp[2][m] = tmp024a + tmp024b * 4 + tmp024c * 8;
1989 tmp[4][m] = tmp024a + tmp024b * 16 + tmp024c + tmp024c;
1990
1991 tmp[1][m] = tmp135a + tmp135b + tmp135b + tmp135c * 16;
1992 tmp[3][m] = tmp135a + tmp135b * 8 + tmp135c * 4;
1993 tmp[5][m] = output0_tm_7[0] + tmp135a + tmp135b * 32 + tmp135c;
1994
1995 output0_tm_0 += tiles * 8;
1996 output0_tm_1 += tiles * 8;
1997 output0_tm_2 += tiles * 8;
1998 output0_tm_3 += tiles * 8;
1999 output0_tm_4 += tiles * 8;
2000 output0_tm_5 += tiles * 8;
2001 output0_tm_6 += tiles * 8;
2002 output0_tm_7 += tiles * 8;
2003 }
2004
2005 unsigned short* output0 = out0.row<unsigned short>(i * 6) + j * 6;
2006
2007 for (int m = 0; m < 6; m++)
2008 {
2009 const float* tmp0 = tmp[m];
2010
2011 float tmp024a = tmp0[1] + tmp0[2];
2012 float tmp135a = tmp0[1] - tmp0[2];
2013
2014 float tmp024b = tmp0[3] + tmp0[4];
2015 float tmp135b = tmp0[3] - tmp0[4];
2016
2017 float tmp024c = tmp0[5] + tmp0[6];
2018 float tmp135c = tmp0[5] - tmp0[6];
2019
2020 output0[0] = float32_to_bfloat16(bias0 + tmp0[0] + tmp024a + tmp024b + tmp024c * 32);
2021 output0[2] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 4 + tmp024c * 8);
2022 output0[4] = float32_to_bfloat16(bias0 + tmp024a + tmp024b * 16 + tmp024c + tmp024c);
2023
2024 output0[1] = float32_to_bfloat16(bias0 + tmp135a + tmp135b + tmp135b + tmp135c * 16);
2025 output0[3] = float32_to_bfloat16(bias0 + tmp135a + tmp135b * 8 + tmp135c * 4);
2026 output0[5] = float32_to_bfloat16(bias0 + tmp0[7] + tmp135a + tmp135b * 32 + tmp135c);
2027
2028 output0 += outw;
2029 }
2030 }
2031 }
2032 }
2033 }
2034 // END transform output
2035
2036 // cut result pad
2037 copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt);
2038 }
2039