1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv3x3s1_pack1ton_rvv(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 const int packn = csrr_vlenb() / 4;
18 const word_type vl = vsetvl_e32m1(packn);
19
20 int inch = bottom_blob.c;
21 int outw = top_blob.w;
22 int outh = top_blob.h;
23 int outch = top_blob.c;
24
25 const float* bias = _bias;
26
27 #pragma omp parallel for num_threads(opt.num_threads)
28 for (int p = 0; p < outch; p++)
29 {
30 Mat out0 = top_blob.channel(p);
31
32 vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl);
33 out0.fill(_bias0);
34
35 const float* k0 = kernel.channel(p);
36
37 int q = 0;
38 for (; q < inch; q++)
39 {
40 float* outptr0 = out0;
41
42 const Mat img0 = bottom_blob.channel(q);
43
44 const float* r0 = img0.row(0);
45 const float* r1 = img0.row(1);
46 const float* r2 = img0.row(2);
47
48 vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl);
49 vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl);
50 vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl);
51 vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl);
52 vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl);
53 vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl);
54 vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl);
55 vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl);
56 vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl);
57
58 int i = 0;
59 for (; i < outh; i++)
60 {
61 int j = 0;
62 for (; j + 7 < outw; j += 8)
63 {
64 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
65 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
66 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
67 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
68 vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl);
69 vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl);
70 vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl);
71 vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl);
72
73 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
74 _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl);
75 _sum2 = vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl);
76 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl);
77 _sum4 = vfmacc_vf_f32m1(_sum4, r0[4], _k00, vl);
78 _sum5 = vfmacc_vf_f32m1(_sum5, r0[5], _k00, vl);
79 _sum6 = vfmacc_vf_f32m1(_sum6, r0[6], _k00, vl);
80 _sum7 = vfmacc_vf_f32m1(_sum7, r0[7], _k00, vl);
81 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
82 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl);
83 _sum2 = vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl);
84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl);
85 _sum4 = vfmacc_vf_f32m1(_sum4, r0[5], _k01, vl);
86 _sum5 = vfmacc_vf_f32m1(_sum5, r0[6], _k01, vl);
87 _sum6 = vfmacc_vf_f32m1(_sum6, r0[7], _k01, vl);
88 _sum7 = vfmacc_vf_f32m1(_sum7, r0[8], _k01, vl);
89 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
90 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl);
91 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl);
92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl);
93 _sum4 = vfmacc_vf_f32m1(_sum4, r0[6], _k02, vl);
94 _sum5 = vfmacc_vf_f32m1(_sum5, r0[7], _k02, vl);
95 _sum6 = vfmacc_vf_f32m1(_sum6, r0[8], _k02, vl);
96 _sum7 = vfmacc_vf_f32m1(_sum7, r0[9], _k02, vl);
97
98 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
99 _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl);
100 _sum2 = vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl);
101 _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl);
102 _sum4 = vfmacc_vf_f32m1(_sum4, r1[4], _k10, vl);
103 _sum5 = vfmacc_vf_f32m1(_sum5, r1[5], _k10, vl);
104 _sum6 = vfmacc_vf_f32m1(_sum6, r1[6], _k10, vl);
105 _sum7 = vfmacc_vf_f32m1(_sum7, r1[7], _k10, vl);
106 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
107 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl);
108 _sum2 = vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl);
109 _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl);
110 _sum4 = vfmacc_vf_f32m1(_sum4, r1[5], _k11, vl);
111 _sum5 = vfmacc_vf_f32m1(_sum5, r1[6], _k11, vl);
112 _sum6 = vfmacc_vf_f32m1(_sum6, r1[7], _k11, vl);
113 _sum7 = vfmacc_vf_f32m1(_sum7, r1[8], _k11, vl);
114 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
115 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl);
116 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl);
117 _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl);
118 _sum4 = vfmacc_vf_f32m1(_sum4, r1[6], _k12, vl);
119 _sum5 = vfmacc_vf_f32m1(_sum5, r1[7], _k12, vl);
120 _sum6 = vfmacc_vf_f32m1(_sum6, r1[8], _k12, vl);
121 _sum7 = vfmacc_vf_f32m1(_sum7, r1[9], _k12, vl);
122
123 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
124 _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl);
125 _sum2 = vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl);
126 _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl);
127 _sum4 = vfmacc_vf_f32m1(_sum4, r2[4], _k20, vl);
128 _sum5 = vfmacc_vf_f32m1(_sum5, r2[5], _k20, vl);
129 _sum6 = vfmacc_vf_f32m1(_sum6, r2[6], _k20, vl);
130 _sum7 = vfmacc_vf_f32m1(_sum7, r2[7], _k20, vl);
131 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
132 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl);
133 _sum2 = vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl);
134 _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl);
135 _sum4 = vfmacc_vf_f32m1(_sum4, r2[5], _k21, vl);
136 _sum5 = vfmacc_vf_f32m1(_sum5, r2[6], _k21, vl);
137 _sum6 = vfmacc_vf_f32m1(_sum6, r2[7], _k21, vl);
138 _sum7 = vfmacc_vf_f32m1(_sum7, r2[8], _k21, vl);
139 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
140 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl);
141 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl);
142 _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl);
143 _sum4 = vfmacc_vf_f32m1(_sum4, r2[6], _k22, vl);
144 _sum5 = vfmacc_vf_f32m1(_sum5, r2[7], _k22, vl);
145 _sum6 = vfmacc_vf_f32m1(_sum6, r2[8], _k22, vl);
146 _sum7 = vfmacc_vf_f32m1(_sum7, r2[9], _k22, vl);
147
148 vse32_v_f32m1(outptr0, _sum0, vl);
149 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
150 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
151 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
152 vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl);
153 vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl);
154 vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl);
155 vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl);
156
157 outptr0 += packn * 8;
158
159 r0 += 8;
160 r1 += 8;
161 r2 += 8;
162 }
163 for (; j + 3 < outw; j += 4)
164 {
165 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
166 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
167 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
168 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
169
170 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
171 _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl);
172 _sum2 = vfmacc_vf_f32m1(_sum2, r0[2], _k00, vl);
173 _sum3 = vfmacc_vf_f32m1(_sum3, r0[3], _k00, vl);
174 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
175 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl);
176 _sum2 = vfmacc_vf_f32m1(_sum2, r0[3], _k01, vl);
177 _sum3 = vfmacc_vf_f32m1(_sum3, r0[4], _k01, vl);
178 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
179 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl);
180 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k02, vl);
181 _sum3 = vfmacc_vf_f32m1(_sum3, r0[5], _k02, vl);
182
183 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
184 _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl);
185 _sum2 = vfmacc_vf_f32m1(_sum2, r1[2], _k10, vl);
186 _sum3 = vfmacc_vf_f32m1(_sum3, r1[3], _k10, vl);
187 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
188 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl);
189 _sum2 = vfmacc_vf_f32m1(_sum2, r1[3], _k11, vl);
190 _sum3 = vfmacc_vf_f32m1(_sum3, r1[4], _k11, vl);
191 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
192 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl);
193 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k12, vl);
194 _sum3 = vfmacc_vf_f32m1(_sum3, r1[5], _k12, vl);
195
196 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
197 _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl);
198 _sum2 = vfmacc_vf_f32m1(_sum2, r2[2], _k20, vl);
199 _sum3 = vfmacc_vf_f32m1(_sum3, r2[3], _k20, vl);
200 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
201 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl);
202 _sum2 = vfmacc_vf_f32m1(_sum2, r2[3], _k21, vl);
203 _sum3 = vfmacc_vf_f32m1(_sum3, r2[4], _k21, vl);
204 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
205 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl);
206 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k22, vl);
207 _sum3 = vfmacc_vf_f32m1(_sum3, r2[5], _k22, vl);
208
209 vse32_v_f32m1(outptr0, _sum0, vl);
210 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
211 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
212 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
213
214 outptr0 += packn * 4;
215
216 r0 += 4;
217 r1 += 4;
218 r2 += 4;
219 }
220 for (; j + 1 < outw; j += 2)
221 {
222 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
223 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
224
225 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
226 _sum1 = vfmacc_vf_f32m1(_sum1, r0[1], _k00, vl);
227 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
228 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k01, vl);
229 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
230 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k02, vl);
231
232 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
233 _sum1 = vfmacc_vf_f32m1(_sum1, r1[1], _k10, vl);
234 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
235 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k11, vl);
236 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
237 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k12, vl);
238
239 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
240 _sum1 = vfmacc_vf_f32m1(_sum1, r2[1], _k20, vl);
241 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
242 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k21, vl);
243 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
244 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k22, vl);
245
246 vse32_v_f32m1(outptr0, _sum0, vl);
247 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
248
249 outptr0 += packn * 2;
250
251 r0 += 2;
252 r1 += 2;
253 r2 += 2;
254 }
255 for (; j < outw; j++)
256 {
257 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
258
259 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
260 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
261 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
262
263 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
264 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
265 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
266
267 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
268 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
269 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
270
271 vse32_v_f32m1(outptr0, _sum0, vl);
272
273 outptr0 += packn;
274
275 r0 += 1;
276 r1 += 1;
277 r2 += 1;
278 }
279
280 r0 += 2;
281 r1 += 2;
282 r2 += 2;
283 }
284
285 k0 += 9 * packn;
286 }
287 }
288 }
289
conv3x3s2_pack1ton_rvv(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)290 static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
291 {
292 const int packn = csrr_vlenb() / 4;
293 const word_type vl = vsetvl_e32m1(packn);
294
295 int w = bottom_blob.w;
296 int inch = bottom_blob.c;
297 int outw = top_blob.w;
298 int outh = top_blob.h;
299 int outch = top_blob.c;
300
301 const int tailstep = w - 2 * outw + w;
302
303 const float* bias = _bias;
304
305 #pragma omp parallel for num_threads(opt.num_threads)
306 for (int p = 0; p < outch; p++)
307 {
308 Mat out0 = top_blob.channel(p);
309
310 vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl);
311 out0.fill(_bias0);
312
313 const float* k0 = kernel.channel(p);
314
315 int q = 0;
316 for (; q < inch; q++)
317 {
318 float* outptr0 = out0;
319
320 const Mat img0 = bottom_blob.channel(q);
321
322 const float* r0 = img0.row(0);
323 const float* r1 = img0.row(1);
324 const float* r2 = img0.row(2);
325
326 vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl);
327 vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl);
328 vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl);
329 vfloat32m1_t _k10 = vle32_v_f32m1(k0 + packn * 3, vl);
330 vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn * 4, vl);
331 vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 5, vl);
332 vfloat32m1_t _k20 = vle32_v_f32m1(k0 + packn * 6, vl);
333 vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn * 7, vl);
334 vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 8, vl);
335
336 int i = 0;
337 for (; i < outh; i++)
338 {
339 int j = 0;
340 for (; j + 7 < outw; j += 8)
341 {
342 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
343 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
344 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
345 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
346 vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl);
347 vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl);
348 vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl);
349 vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl);
350
351 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
352 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl);
353 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl);
354 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl);
355 _sum4 = vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl);
356 _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl);
357 _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl);
358 _sum7 = vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl);
359 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
360 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl);
361 _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl);
362 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl);
363 _sum4 = vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl);
364 _sum5 = vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl);
365 _sum6 = vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl);
366 _sum7 = vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl);
367 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
368 _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl);
369 _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl);
370 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl);
371 _sum4 = vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl);
372 _sum5 = vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl);
373 _sum6 = vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl);
374 _sum7 = vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl);
375
376 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
377 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl);
378 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl);
379 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl);
380 _sum4 = vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl);
381 _sum5 = vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl);
382 _sum6 = vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl);
383 _sum7 = vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl);
384 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
385 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl);
386 _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl);
387 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl);
388 _sum4 = vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl);
389 _sum5 = vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl);
390 _sum6 = vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl);
391 _sum7 = vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl);
392 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
393 _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl);
394 _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl);
395 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl);
396 _sum4 = vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl);
397 _sum5 = vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl);
398 _sum6 = vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl);
399 _sum7 = vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl);
400
401 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
402 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl);
403 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl);
404 _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl);
405 _sum4 = vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl);
406 _sum5 = vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl);
407 _sum6 = vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl);
408 _sum7 = vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl);
409 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
410 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl);
411 _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl);
412 _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl);
413 _sum4 = vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl);
414 _sum5 = vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl);
415 _sum6 = vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl);
416 _sum7 = vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl);
417 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
418 _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl);
419 _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl);
420 _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl);
421 _sum4 = vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl);
422 _sum5 = vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl);
423 _sum6 = vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl);
424 _sum7 = vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl);
425
426 vse32_v_f32m1(outptr0, _sum0, vl);
427 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
428 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
429 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
430 vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl);
431 vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl);
432 vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl);
433 vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl);
434
435 outptr0 += packn * 8;
436
437 r0 += 16;
438 r1 += 16;
439 r2 += 16;
440 }
441 for (; j + 3 < outw; j += 4)
442 {
443 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
444 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
445 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
446 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
447
448 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
449 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl);
450 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl);
451 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl);
452 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
453 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl);
454 _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl);
455 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl);
456 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
457 _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl);
458 _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl);
459 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl);
460
461 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
462 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl);
463 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl);
464 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl);
465 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
466 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl);
467 _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl);
468 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl);
469 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
470 _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl);
471 _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl);
472 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl);
473
474 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
475 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl);
476 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl);
477 _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl);
478 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
479 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl);
480 _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl);
481 _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl);
482 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
483 _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl);
484 _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl);
485 _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl);
486
487 vse32_v_f32m1(outptr0, _sum0, vl);
488 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
489 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
490 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
491
492 outptr0 += packn * 4;
493
494 r0 += 8;
495 r1 += 8;
496 r2 += 8;
497 }
498 for (; j + 1 < outw; j += 2)
499 {
500 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
501 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
502
503 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
504 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl);
505 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
506 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl);
507 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
508 _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl);
509
510 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
511 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl);
512 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
513 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl);
514 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
515 _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl);
516
517 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
518 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl);
519 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
520 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl);
521 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
522 _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl);
523
524 vse32_v_f32m1(outptr0, _sum0, vl);
525 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
526
527 outptr0 += packn * 2;
528
529 r0 += 4;
530 r1 += 4;
531 r2 += 4;
532 }
533 for (; j < outw; j++)
534 {
535 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
536
537 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
538 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
539 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
540
541 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
542 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
543 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
544
545 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
546 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
547 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
548
549 vse32_v_f32m1(outptr0, _sum0, vl);
550
551 outptr0 += packn;
552
553 r0 += 2;
554 r1 += 2;
555 r2 += 2;
556 }
557
558 r0 += tailstep;
559 r1 += tailstep;
560 r2 += tailstep;
561 }
562
563 k0 += 9 * packn;
564 }
565 }
566 }
567