1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
convdw5x5s1_packn_rvv(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 const int packn = csrr_vlenb() / 4;
18 const word_type vl = vsetvl_e32m1(packn);
19
20 int w = bottom_blob.w;
21
22 int outw = top_blob.w;
23 int outh = top_blob.h;
24
25 const int group = bottom_blob.c;
26
27 const float* bias = _bias;
28
29 #pragma omp parallel for num_threads(opt.num_threads)
30 for (int g = 0; g < group; g++)
31 {
32 Mat out = top_blob.channel(g);
33
34 vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl);
35
36 const float* k0 = kernel.row(g);
37
38 float* outptr0 = out.row(0);
39 float* outptr1 = out.row(1);
40
41 const Mat img0 = bottom_blob.channel(g);
42
43 const float* r0 = img0.row(0);
44 const float* r1 = img0.row(1);
45 const float* r2 = img0.row(2);
46 const float* r3 = img0.row(3);
47 const float* r4 = img0.row(4);
48 const float* r5 = img0.row(5);
49
50 int i = 0;
51 for (; i + 1 < outh; i += 2)
52 {
53 int j = 0;
54 for (; j < outw; j++)
55 {
56 vfloat32m1_t _sum0 = _bias0;
57 vfloat32m1_t _sum1 = _bias0;
58
59 vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl);
60 vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl);
61 vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl);
62 vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl);
63 vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl);
64
65 vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl);
66 vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl);
67 vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl);
68 vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl);
69 vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl);
70 k0 += packn * 5;
71
72 _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl);
73 _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl);
74 _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl);
75 _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl);
76 _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl);
77
78 vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl);
79 vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl);
80 vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl);
81 vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl);
82 vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl);
83
84 _sum1 = vfmacc_vv_f32m1(_sum1, _k00, _r10, vl);
85 _sum1 = vfmacc_vv_f32m1(_sum1, _k01, _r11, vl);
86 _sum1 = vfmacc_vv_f32m1(_sum1, _k02, _r12, vl);
87 _sum1 = vfmacc_vv_f32m1(_sum1, _k03, _r13, vl);
88 _sum1 = vfmacc_vv_f32m1(_sum1, _k04, _r14, vl);
89
90 vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl);
91 vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl);
92 vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl);
93 vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl);
94 vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl);
95 k0 += packn * 5;
96
97 _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl);
98 _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl);
99 _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl);
100 _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl);
101 _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl);
102
103 vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl);
104 vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl);
105 vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl);
106 vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl);
107 vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl);
108
109 _sum1 = vfmacc_vv_f32m1(_sum1, _k10, _r20, vl);
110 _sum1 = vfmacc_vv_f32m1(_sum1, _k11, _r21, vl);
111 _sum1 = vfmacc_vv_f32m1(_sum1, _k12, _r22, vl);
112 _sum1 = vfmacc_vv_f32m1(_sum1, _k13, _r23, vl);
113 _sum1 = vfmacc_vv_f32m1(_sum1, _k14, _r24, vl);
114
115 vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl);
116 vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl);
117 vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl);
118 vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl);
119 vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl);
120 k0 += packn * 5;
121
122 _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl);
123 _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl);
124 _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl);
125 _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl);
126 _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl);
127
128 vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl);
129 vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl);
130 vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl);
131 vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl);
132 vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl);
133
134 _sum1 = vfmacc_vv_f32m1(_sum1, _k20, _r30, vl);
135 _sum1 = vfmacc_vv_f32m1(_sum1, _k21, _r31, vl);
136 _sum1 = vfmacc_vv_f32m1(_sum1, _k22, _r32, vl);
137 _sum1 = vfmacc_vv_f32m1(_sum1, _k23, _r33, vl);
138 _sum1 = vfmacc_vv_f32m1(_sum1, _k24, _r34, vl);
139
140 vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl);
141 vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl);
142 vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl);
143 vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl);
144 vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl);
145 k0 += packn * 5;
146
147 _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl);
148 _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl);
149 _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl);
150 _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl);
151 _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl);
152
153 vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl);
154 vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl);
155 vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl);
156 vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl);
157 vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl);
158
159 _sum1 = vfmacc_vv_f32m1(_sum1, _k30, _r40, vl);
160 _sum1 = vfmacc_vv_f32m1(_sum1, _k31, _r41, vl);
161 _sum1 = vfmacc_vv_f32m1(_sum1, _k32, _r42, vl);
162 _sum1 = vfmacc_vv_f32m1(_sum1, _k33, _r43, vl);
163 _sum1 = vfmacc_vv_f32m1(_sum1, _k34, _r44, vl);
164
165 vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl);
166 vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl);
167 vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl);
168 vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl);
169 vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl);
170 k0 -= packn * 20;
171
172 _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl);
173 _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl);
174 _sum0 = vfmacc_vv_f32m1(_sum0, _k42, _r42, vl);
175 _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl);
176 _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl);
177
178 vfloat32m1_t _r50 = vle32_v_f32m1(r5, vl);
179 vfloat32m1_t _r51 = vle32_v_f32m1(r5 + packn, vl);
180 vfloat32m1_t _r52 = vle32_v_f32m1(r5 + packn * 2, vl);
181 vfloat32m1_t _r53 = vle32_v_f32m1(r5 + packn * 3, vl);
182 vfloat32m1_t _r54 = vle32_v_f32m1(r5 + packn * 4, vl);
183
184 _sum1 = vfmacc_vv_f32m1(_sum1, _k40, _r50, vl);
185 _sum1 = vfmacc_vv_f32m1(_sum1, _k41, _r51, vl);
186 _sum1 = vfmacc_vv_f32m1(_sum1, _k42, _r52, vl);
187 _sum1 = vfmacc_vv_f32m1(_sum1, _k43, _r53, vl);
188 _sum1 = vfmacc_vv_f32m1(_sum1, _k44, _r54, vl);
189
190 vse32_v_f32m1(outptr0, _sum0, vl);
191 vse32_v_f32m1(outptr1, _sum1, vl);
192
193 outptr0 += packn;
194 outptr1 += packn;
195
196 r0 += packn;
197 r1 += packn;
198 r2 += packn;
199 r3 += packn;
200 r4 += packn;
201 r5 += packn;
202 }
203
204 r0 += 4 * packn + w * packn;
205 r1 += 4 * packn + w * packn;
206 r2 += 4 * packn + w * packn;
207 r3 += 4 * packn + w * packn;
208 r4 += 4 * packn + w * packn;
209 r5 += 4 * packn + w * packn;
210
211 outptr0 += outw * packn;
212 outptr1 += outw * packn;
213 }
214 for (; i < outh; i++)
215 {
216 int j = 0;
217 for (; j < outw; j++)
218 {
219 vfloat32m1_t _sum0 = _bias0;
220
221 vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl);
222 vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl);
223 vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl);
224 vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl);
225 vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl);
226
227 vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl);
228 vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl);
229 vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl);
230 vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl);
231 vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl);
232 k0 += packn * 5;
233
234 _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl);
235 _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl);
236 _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl);
237 _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl);
238 _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl);
239
240 vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl);
241 vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl);
242 vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl);
243 vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl);
244 vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl);
245
246 vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl);
247 vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl);
248 vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl);
249 vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl);
250 vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl);
251 k0 += packn * 5;
252
253 _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl);
254 _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl);
255 _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl);
256 _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl);
257 _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl);
258
259 vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl);
260 vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl);
261 vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl);
262 vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl);
263 vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl);
264
265 vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl);
266 vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl);
267 vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl);
268 vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl);
269 vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl);
270 k0 += packn * 5;
271
272 _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl);
273 _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl);
274 _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl);
275 _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl);
276 _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl);
277
278 vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl);
279 vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl);
280 vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl);
281 vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl);
282 vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl);
283
284 vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl);
285 vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl);
286 vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl);
287 vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl);
288 vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl);
289 k0 += packn * 5;
290
291 _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl);
292 _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl);
293 _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl);
294 _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl);
295 _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl);
296
297 vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl);
298 vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl);
299 vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl);
300 vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl);
301 vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl);
302
303 vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl);
304 vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl);
305 vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl);
306 vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl);
307 vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl);
308 k0 -= packn * 20;
309
310 _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl);
311 _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl);
312 _sum0 = vfmacc_vv_f32m1(_sum0, _k42, _r42, vl);
313 _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl);
314 _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl);
315
316 vse32_v_f32m1(outptr0, _sum0, vl);
317
318 outptr0 += packn;
319
320 r0 += packn;
321 r1 += packn;
322 r2 += packn;
323 r3 += packn;
324 r4 += packn;
325 }
326
327 r0 += 4 * packn;
328 r1 += 4 * packn;
329 r2 += 4 * packn;
330 r3 += 4 * packn;
331 r4 += 4 * packn;
332 }
333 }
334 }
335
convdw5x5s2_packn_rvv(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)336 static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
337 {
338 const int packn = csrr_vlenb() / 4;
339 const word_type vl = vsetvl_e32m1(packn);
340
341 int w = bottom_blob.w;
342
343 int outw = top_blob.w;
344 int outh = top_blob.h;
345
346 const int group = bottom_blob.c;
347
348 const int tailstep = (w - 2 * outw + w) * packn;
349
350 const float* bias = _bias;
351
352 #pragma omp parallel for num_threads(opt.num_threads)
353 for (int g = 0; g < group; g++)
354 {
355 Mat out = top_blob.channel(g);
356
357 vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + g * packn, vl) : vfmv_v_f_f32m1(0.f, vl);
358
359 const float* k0 = kernel.row(g);
360
361 float* outptr0 = out;
362
363 const Mat img0 = bottom_blob.channel(g);
364
365 const float* r0 = img0.row(0);
366 const float* r1 = img0.row(1);
367 const float* r2 = img0.row(2);
368 const float* r3 = img0.row(3);
369 const float* r4 = img0.row(4);
370
371 int i = 0;
372 for (; i < outh; i++)
373 {
374 int j = 0;
375 for (; j < outw; j++)
376 {
377 vfloat32m1_t _sum0 = _bias0;
378
379 vfloat32m1_t _r00 = vle32_v_f32m1(r0, vl);
380 vfloat32m1_t _r01 = vle32_v_f32m1(r0 + packn, vl);
381 vfloat32m1_t _r02 = vle32_v_f32m1(r0 + packn * 2, vl);
382 vfloat32m1_t _r03 = vle32_v_f32m1(r0 + packn * 3, vl);
383 vfloat32m1_t _r04 = vle32_v_f32m1(r0 + packn * 4, vl);
384
385 vfloat32m1_t _k00 = vle32_v_f32m1(k0, vl);
386 vfloat32m1_t _k01 = vle32_v_f32m1(k0 + packn, vl);
387 vfloat32m1_t _k02 = vle32_v_f32m1(k0 + packn * 2, vl);
388 vfloat32m1_t _k03 = vle32_v_f32m1(k0 + packn * 3, vl);
389 vfloat32m1_t _k04 = vle32_v_f32m1(k0 + packn * 4, vl);
390 k0 += packn * 5;
391
392 _sum0 = vfmacc_vv_f32m1(_sum0, _k00, _r00, vl);
393 _sum0 = vfmacc_vv_f32m1(_sum0, _k01, _r01, vl);
394 _sum0 = vfmacc_vv_f32m1(_sum0, _k02, _r02, vl);
395 _sum0 = vfmacc_vv_f32m1(_sum0, _k03, _r03, vl);
396 _sum0 = vfmacc_vv_f32m1(_sum0, _k04, _r04, vl);
397
398 vfloat32m1_t _r10 = vle32_v_f32m1(r1, vl);
399 vfloat32m1_t _r11 = vle32_v_f32m1(r1 + packn, vl);
400 vfloat32m1_t _r12 = vle32_v_f32m1(r1 + packn * 2, vl);
401 vfloat32m1_t _r13 = vle32_v_f32m1(r1 + packn * 3, vl);
402 vfloat32m1_t _r14 = vle32_v_f32m1(r1 + packn * 4, vl);
403
404 vfloat32m1_t _k10 = vle32_v_f32m1(k0, vl);
405 vfloat32m1_t _k11 = vle32_v_f32m1(k0 + packn, vl);
406 vfloat32m1_t _k12 = vle32_v_f32m1(k0 + packn * 2, vl);
407 vfloat32m1_t _k13 = vle32_v_f32m1(k0 + packn * 3, vl);
408 vfloat32m1_t _k14 = vle32_v_f32m1(k0 + packn * 4, vl);
409 k0 += packn * 5;
410
411 _sum0 = vfmacc_vv_f32m1(_sum0, _k10, _r10, vl);
412 _sum0 = vfmacc_vv_f32m1(_sum0, _k11, _r11, vl);
413 _sum0 = vfmacc_vv_f32m1(_sum0, _k12, _r12, vl);
414 _sum0 = vfmacc_vv_f32m1(_sum0, _k13, _r13, vl);
415 _sum0 = vfmacc_vv_f32m1(_sum0, _k14, _r14, vl);
416
417 vfloat32m1_t _r20 = vle32_v_f32m1(r2, vl);
418 vfloat32m1_t _r21 = vle32_v_f32m1(r2 + packn, vl);
419 vfloat32m1_t _r22 = vle32_v_f32m1(r2 + packn * 2, vl);
420 vfloat32m1_t _r23 = vle32_v_f32m1(r2 + packn * 3, vl);
421 vfloat32m1_t _r24 = vle32_v_f32m1(r2 + packn * 4, vl);
422
423 vfloat32m1_t _k20 = vle32_v_f32m1(k0, vl);
424 vfloat32m1_t _k21 = vle32_v_f32m1(k0 + packn, vl);
425 vfloat32m1_t _k22 = vle32_v_f32m1(k0 + packn * 2, vl);
426 vfloat32m1_t _k23 = vle32_v_f32m1(k0 + packn * 3, vl);
427 vfloat32m1_t _k24 = vle32_v_f32m1(k0 + packn * 4, vl);
428 k0 += packn * 5;
429
430 _sum0 = vfmacc_vv_f32m1(_sum0, _k20, _r20, vl);
431 _sum0 = vfmacc_vv_f32m1(_sum0, _k21, _r21, vl);
432 _sum0 = vfmacc_vv_f32m1(_sum0, _k22, _r22, vl);
433 _sum0 = vfmacc_vv_f32m1(_sum0, _k23, _r23, vl);
434 _sum0 = vfmacc_vv_f32m1(_sum0, _k24, _r24, vl);
435
436 vfloat32m1_t _r30 = vle32_v_f32m1(r3, vl);
437 vfloat32m1_t _r31 = vle32_v_f32m1(r3 + packn, vl);
438 vfloat32m1_t _r32 = vle32_v_f32m1(r3 + packn * 2, vl);
439 vfloat32m1_t _r33 = vle32_v_f32m1(r3 + packn * 3, vl);
440 vfloat32m1_t _r34 = vle32_v_f32m1(r3 + packn * 4, vl);
441
442 vfloat32m1_t _k30 = vle32_v_f32m1(k0, vl);
443 vfloat32m1_t _k31 = vle32_v_f32m1(k0 + packn, vl);
444 vfloat32m1_t _k32 = vle32_v_f32m1(k0 + packn * 2, vl);
445 vfloat32m1_t _k33 = vle32_v_f32m1(k0 + packn * 3, vl);
446 vfloat32m1_t _k34 = vle32_v_f32m1(k0 + packn * 4, vl);
447 k0 += packn * 5;
448
449 _sum0 = vfmacc_vv_f32m1(_sum0, _k30, _r30, vl);
450 _sum0 = vfmacc_vv_f32m1(_sum0, _k31, _r31, vl);
451 _sum0 = vfmacc_vv_f32m1(_sum0, _k32, _r32, vl);
452 _sum0 = vfmacc_vv_f32m1(_sum0, _k33, _r33, vl);
453 _sum0 = vfmacc_vv_f32m1(_sum0, _k34, _r34, vl);
454
455 vfloat32m1_t _r40 = vle32_v_f32m1(r4, vl);
456 vfloat32m1_t _r41 = vle32_v_f32m1(r4 + packn, vl);
457 vfloat32m1_t _r42 = vle32_v_f32m1(r4 + packn * 2, vl);
458 vfloat32m1_t _r43 = vle32_v_f32m1(r4 + packn * 3, vl);
459 vfloat32m1_t _r44 = vle32_v_f32m1(r4 + packn * 4, vl);
460
461 vfloat32m1_t _k40 = vle32_v_f32m1(k0, vl);
462 vfloat32m1_t _k41 = vle32_v_f32m1(k0 + packn, vl);
463 vfloat32m1_t _k42 = vle32_v_f32m1(k0 + packn * 2, vl);
464 vfloat32m1_t _k43 = vle32_v_f32m1(k0 + packn * 3, vl);
465 vfloat32m1_t _k44 = vle32_v_f32m1(k0 + packn * 4, vl);
466 k0 -= packn * 20;
467
468 _sum0 = vfmacc_vv_f32m1(_sum0, _k40, _r40, vl);
469 _sum0 = vfmacc_vv_f32m1(_sum0, _k41, _r41, vl);
470 _sum0 = vfmacc_vv_f32m1(_sum0, _k42, _r42, vl);
471 _sum0 = vfmacc_vv_f32m1(_sum0, _k43, _r43, vl);
472 _sum0 = vfmacc_vv_f32m1(_sum0, _k44, _r44, vl);
473
474 vse32_v_f32m1(outptr0, _sum0, vl);
475
476 outptr0 += packn;
477
478 r0 += packn * 2;
479 r1 += packn * 2;
480 r2 += packn * 2;
481 r3 += packn * 2;
482 r4 += packn * 2;
483 }
484
485 r0 += tailstep;
486 r1 += tailstep;
487 r2 += tailstep;
488 r3 += tailstep;
489 r4 += tailstep;
490 }
491 }
492 }
493