1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
conv7x7s2_pack1ton_rvv(const Mat & bottom_blob,Mat & top_blob,const Mat & kernel,const Mat & _bias,const Option & opt)15 static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
16 {
17 const int packn = csrr_vlenb() / 4;
18 const word_type vl = vsetvl_e32m1(packn);
19
20 int w = bottom_blob.w;
21 int inch = bottom_blob.c;
22
23 int outw = top_blob.w;
24 int outh = top_blob.h;
25 int outch = top_blob.c;
26
27 const int tailstep = w - 2 * outw + w;
28
29 const float* bias = _bias;
30
31 #pragma omp parallel for num_threads(opt.num_threads)
32 for (int p = 0; p < outch; p++)
33 {
34 Mat out0 = top_blob.channel(p);
35
36 vfloat32m1_t _bias0 = bias ? vle32_v_f32m1(bias + p * packn, vl) : vfmv_v_f_f32m1(0.f, vl);
37 out0.fill(_bias0);
38
39 for (int q = 0; q < inch; q++)
40 {
41 float* outptr0 = out0;
42
43 const Mat img0 = bottom_blob.channel(q);
44
45 const float* r0 = img0.row(0);
46 const float* r1 = img0.row(1);
47 const float* r2 = img0.row(2);
48 const float* r3 = img0.row(3);
49 const float* r4 = img0.row(4);
50 const float* r5 = img0.row(5);
51 const float* r6 = img0.row(6);
52
53 const float* kptr = kernel.channel(p).row(q);
54
55 int i = 0;
56
57 for (; i < outh; i++)
58 {
59 int j = 0;
60 for (; j + 7 < outw; j += 8)
61 {
62 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
63 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
64 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
65 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
66 vfloat32m1_t _sum4 = vle32_v_f32m1(outptr0 + packn * 4, vl);
67 vfloat32m1_t _sum5 = vle32_v_f32m1(outptr0 + packn * 5, vl);
68 vfloat32m1_t _sum6 = vle32_v_f32m1(outptr0 + packn * 6, vl);
69 vfloat32m1_t _sum7 = vle32_v_f32m1(outptr0 + packn * 7, vl);
70
71 vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl);
72 vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl);
73 vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl);
74 vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl);
75 vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl);
76 vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl);
77 vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl);
78
79 kptr += packn * 7;
80
81 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
82 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl);
83 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl);
84 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl);
85 _sum4 = vfmacc_vf_f32m1(_sum4, r0[8], _k00, vl);
86 _sum5 = vfmacc_vf_f32m1(_sum5, r0[10], _k00, vl);
87 _sum6 = vfmacc_vf_f32m1(_sum6, r0[12], _k00, vl);
88 _sum7 = vfmacc_vf_f32m1(_sum7, r0[14], _k00, vl);
89 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
90 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl);
91 _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl);
92 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl);
93 _sum4 = vfmacc_vf_f32m1(_sum4, r0[9], _k01, vl);
94 _sum5 = vfmacc_vf_f32m1(_sum5, r0[11], _k01, vl);
95 _sum6 = vfmacc_vf_f32m1(_sum6, r0[13], _k01, vl);
96 _sum7 = vfmacc_vf_f32m1(_sum7, r0[15], _k01, vl);
97 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
98 _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl);
99 _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl);
100 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl);
101 _sum4 = vfmacc_vf_f32m1(_sum4, r0[10], _k02, vl);
102 _sum5 = vfmacc_vf_f32m1(_sum5, r0[12], _k02, vl);
103 _sum6 = vfmacc_vf_f32m1(_sum6, r0[14], _k02, vl);
104 _sum7 = vfmacc_vf_f32m1(_sum7, r0[16], _k02, vl);
105 _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl);
106 _sum1 = vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl);
107 _sum2 = vfmacc_vf_f32m1(_sum2, r0[7], _k03, vl);
108 _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl);
109 _sum4 = vfmacc_vf_f32m1(_sum4, r0[11], _k03, vl);
110 _sum5 = vfmacc_vf_f32m1(_sum5, r0[13], _k03, vl);
111 _sum6 = vfmacc_vf_f32m1(_sum6, r0[15], _k03, vl);
112 _sum7 = vfmacc_vf_f32m1(_sum7, r0[17], _k03, vl);
113 _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl);
114 _sum1 = vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl);
115 _sum2 = vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl);
116 _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl);
117 _sum4 = vfmacc_vf_f32m1(_sum4, r0[12], _k04, vl);
118 _sum5 = vfmacc_vf_f32m1(_sum5, r0[14], _k04, vl);
119 _sum6 = vfmacc_vf_f32m1(_sum6, r0[16], _k04, vl);
120 _sum7 = vfmacc_vf_f32m1(_sum7, r0[18], _k04, vl);
121 _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl);
122 _sum1 = vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl);
123 _sum2 = vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl);
124 _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl);
125 _sum4 = vfmacc_vf_f32m1(_sum4, r0[13], _k05, vl);
126 _sum5 = vfmacc_vf_f32m1(_sum5, r0[15], _k05, vl);
127 _sum6 = vfmacc_vf_f32m1(_sum6, r0[17], _k05, vl);
128 _sum7 = vfmacc_vf_f32m1(_sum7, r0[19], _k05, vl);
129 _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl);
130 _sum1 = vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl);
131 _sum2 = vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl);
132 _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl);
133 _sum4 = vfmacc_vf_f32m1(_sum4, r0[14], _k06, vl);
134 _sum5 = vfmacc_vf_f32m1(_sum5, r0[16], _k06, vl);
135 _sum6 = vfmacc_vf_f32m1(_sum6, r0[18], _k06, vl);
136 _sum7 = vfmacc_vf_f32m1(_sum7, r0[20], _k06, vl);
137
138 vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl);
139 vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl);
140 vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl);
141 vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl);
142 vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl);
143 vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl);
144 vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl);
145
146 kptr += packn * 7;
147
148 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
149 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl);
150 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl);
151 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl);
152 _sum4 = vfmacc_vf_f32m1(_sum4, r1[8], _k10, vl);
153 _sum5 = vfmacc_vf_f32m1(_sum5, r1[10], _k10, vl);
154 _sum6 = vfmacc_vf_f32m1(_sum6, r1[12], _k10, vl);
155 _sum7 = vfmacc_vf_f32m1(_sum7, r1[14], _k10, vl);
156 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
157 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl);
158 _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl);
159 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl);
160 _sum4 = vfmacc_vf_f32m1(_sum4, r1[9], _k11, vl);
161 _sum5 = vfmacc_vf_f32m1(_sum5, r1[11], _k11, vl);
162 _sum6 = vfmacc_vf_f32m1(_sum6, r1[13], _k11, vl);
163 _sum7 = vfmacc_vf_f32m1(_sum7, r1[15], _k11, vl);
164 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
165 _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl);
166 _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl);
167 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl);
168 _sum4 = vfmacc_vf_f32m1(_sum4, r1[10], _k12, vl);
169 _sum5 = vfmacc_vf_f32m1(_sum5, r1[12], _k12, vl);
170 _sum6 = vfmacc_vf_f32m1(_sum6, r1[14], _k12, vl);
171 _sum7 = vfmacc_vf_f32m1(_sum7, r1[16], _k12, vl);
172 _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl);
173 _sum1 = vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl);
174 _sum2 = vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl);
175 _sum3 = vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl);
176 _sum4 = vfmacc_vf_f32m1(_sum4, r1[11], _k13, vl);
177 _sum5 = vfmacc_vf_f32m1(_sum5, r1[13], _k13, vl);
178 _sum6 = vfmacc_vf_f32m1(_sum6, r1[15], _k13, vl);
179 _sum7 = vfmacc_vf_f32m1(_sum7, r1[17], _k13, vl);
180 _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl);
181 _sum1 = vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl);
182 _sum2 = vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl);
183 _sum3 = vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl);
184 _sum4 = vfmacc_vf_f32m1(_sum4, r1[12], _k14, vl);
185 _sum5 = vfmacc_vf_f32m1(_sum5, r1[14], _k14, vl);
186 _sum6 = vfmacc_vf_f32m1(_sum6, r1[16], _k14, vl);
187 _sum7 = vfmacc_vf_f32m1(_sum7, r1[18], _k14, vl);
188 _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl);
189 _sum1 = vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl);
190 _sum2 = vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl);
191 _sum3 = vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl);
192 _sum4 = vfmacc_vf_f32m1(_sum4, r1[13], _k15, vl);
193 _sum5 = vfmacc_vf_f32m1(_sum5, r1[15], _k15, vl);
194 _sum6 = vfmacc_vf_f32m1(_sum6, r1[17], _k15, vl);
195 _sum7 = vfmacc_vf_f32m1(_sum7, r1[19], _k15, vl);
196 _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl);
197 _sum1 = vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl);
198 _sum2 = vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl);
199 _sum3 = vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl);
200 _sum4 = vfmacc_vf_f32m1(_sum4, r1[14], _k16, vl);
201 _sum5 = vfmacc_vf_f32m1(_sum5, r1[16], _k16, vl);
202 _sum6 = vfmacc_vf_f32m1(_sum6, r1[18], _k16, vl);
203 _sum7 = vfmacc_vf_f32m1(_sum7, r1[20], _k16, vl);
204
205 vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl);
206 vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl);
207 vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl);
208 vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl);
209 vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl);
210 vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl);
211 vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl);
212
213 kptr += packn * 7;
214
215 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
216 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl);
217 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl);
218 _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl);
219 _sum4 = vfmacc_vf_f32m1(_sum4, r2[8], _k20, vl);
220 _sum5 = vfmacc_vf_f32m1(_sum5, r2[10], _k20, vl);
221 _sum6 = vfmacc_vf_f32m1(_sum6, r2[12], _k20, vl);
222 _sum7 = vfmacc_vf_f32m1(_sum7, r2[14], _k20, vl);
223 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
224 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl);
225 _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl);
226 _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl);
227 _sum4 = vfmacc_vf_f32m1(_sum4, r2[9], _k21, vl);
228 _sum5 = vfmacc_vf_f32m1(_sum5, r2[11], _k21, vl);
229 _sum6 = vfmacc_vf_f32m1(_sum6, r2[13], _k21, vl);
230 _sum7 = vfmacc_vf_f32m1(_sum7, r2[15], _k21, vl);
231 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
232 _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl);
233 _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl);
234 _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl);
235 _sum4 = vfmacc_vf_f32m1(_sum4, r2[10], _k22, vl);
236 _sum5 = vfmacc_vf_f32m1(_sum5, r2[12], _k22, vl);
237 _sum6 = vfmacc_vf_f32m1(_sum6, r2[14], _k22, vl);
238 _sum7 = vfmacc_vf_f32m1(_sum7, r2[16], _k22, vl);
239 _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl);
240 _sum1 = vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl);
241 _sum2 = vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl);
242 _sum3 = vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl);
243 _sum4 = vfmacc_vf_f32m1(_sum4, r2[11], _k23, vl);
244 _sum5 = vfmacc_vf_f32m1(_sum5, r2[13], _k23, vl);
245 _sum6 = vfmacc_vf_f32m1(_sum6, r2[15], _k23, vl);
246 _sum7 = vfmacc_vf_f32m1(_sum7, r2[17], _k23, vl);
247 _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl);
248 _sum1 = vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl);
249 _sum2 = vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl);
250 _sum3 = vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl);
251 _sum4 = vfmacc_vf_f32m1(_sum4, r2[12], _k24, vl);
252 _sum5 = vfmacc_vf_f32m1(_sum5, r2[14], _k24, vl);
253 _sum6 = vfmacc_vf_f32m1(_sum6, r2[16], _k24, vl);
254 _sum7 = vfmacc_vf_f32m1(_sum7, r2[18], _k24, vl);
255 _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl);
256 _sum1 = vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl);
257 _sum2 = vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl);
258 _sum3 = vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl);
259 _sum4 = vfmacc_vf_f32m1(_sum4, r2[13], _k25, vl);
260 _sum5 = vfmacc_vf_f32m1(_sum5, r2[15], _k25, vl);
261 _sum6 = vfmacc_vf_f32m1(_sum6, r2[17], _k25, vl);
262 _sum7 = vfmacc_vf_f32m1(_sum7, r2[19], _k25, vl);
263 _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl);
264 _sum1 = vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl);
265 _sum2 = vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl);
266 _sum3 = vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl);
267 _sum4 = vfmacc_vf_f32m1(_sum4, r2[14], _k26, vl);
268 _sum5 = vfmacc_vf_f32m1(_sum5, r2[16], _k26, vl);
269 _sum6 = vfmacc_vf_f32m1(_sum6, r2[18], _k26, vl);
270 _sum7 = vfmacc_vf_f32m1(_sum7, r2[20], _k26, vl);
271
272 vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl);
273 vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl);
274 vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl);
275 vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl);
276 vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl);
277 vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl);
278 vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl);
279
280 kptr += packn * 7;
281
282 _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl);
283 _sum1 = vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl);
284 _sum2 = vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl);
285 _sum3 = vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl);
286 _sum4 = vfmacc_vf_f32m1(_sum4, r3[8], _k30, vl);
287 _sum5 = vfmacc_vf_f32m1(_sum5, r3[10], _k30, vl);
288 _sum6 = vfmacc_vf_f32m1(_sum6, r3[12], _k30, vl);
289 _sum7 = vfmacc_vf_f32m1(_sum7, r3[14], _k30, vl);
290 _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl);
291 _sum1 = vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl);
292 _sum2 = vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl);
293 _sum3 = vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl);
294 _sum4 = vfmacc_vf_f32m1(_sum4, r3[9], _k31, vl);
295 _sum5 = vfmacc_vf_f32m1(_sum5, r3[11], _k31, vl);
296 _sum6 = vfmacc_vf_f32m1(_sum6, r3[13], _k31, vl);
297 _sum7 = vfmacc_vf_f32m1(_sum7, r3[15], _k31, vl);
298 _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl);
299 _sum1 = vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl);
300 _sum2 = vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl);
301 _sum3 = vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl);
302 _sum4 = vfmacc_vf_f32m1(_sum4, r3[10], _k32, vl);
303 _sum5 = vfmacc_vf_f32m1(_sum5, r3[12], _k32, vl);
304 _sum6 = vfmacc_vf_f32m1(_sum6, r3[14], _k32, vl);
305 _sum7 = vfmacc_vf_f32m1(_sum7, r3[16], _k32, vl);
306 _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl);
307 _sum1 = vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl);
308 _sum2 = vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl);
309 _sum3 = vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl);
310 _sum4 = vfmacc_vf_f32m1(_sum4, r3[11], _k33, vl);
311 _sum5 = vfmacc_vf_f32m1(_sum5, r3[13], _k33, vl);
312 _sum6 = vfmacc_vf_f32m1(_sum6, r3[15], _k33, vl);
313 _sum7 = vfmacc_vf_f32m1(_sum7, r3[17], _k33, vl);
314 _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl);
315 _sum1 = vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl);
316 _sum2 = vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl);
317 _sum3 = vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl);
318 _sum4 = vfmacc_vf_f32m1(_sum4, r3[12], _k34, vl);
319 _sum5 = vfmacc_vf_f32m1(_sum5, r3[14], _k34, vl);
320 _sum6 = vfmacc_vf_f32m1(_sum6, r3[16], _k34, vl);
321 _sum7 = vfmacc_vf_f32m1(_sum7, r3[18], _k34, vl);
322 _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl);
323 _sum1 = vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl);
324 _sum2 = vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl);
325 _sum3 = vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl);
326 _sum4 = vfmacc_vf_f32m1(_sum4, r3[13], _k35, vl);
327 _sum5 = vfmacc_vf_f32m1(_sum5, r3[15], _k35, vl);
328 _sum6 = vfmacc_vf_f32m1(_sum6, r3[17], _k35, vl);
329 _sum7 = vfmacc_vf_f32m1(_sum7, r3[19], _k35, vl);
330 _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl);
331 _sum1 = vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl);
332 _sum2 = vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl);
333 _sum3 = vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl);
334 _sum4 = vfmacc_vf_f32m1(_sum4, r3[14], _k36, vl);
335 _sum5 = vfmacc_vf_f32m1(_sum5, r3[16], _k36, vl);
336 _sum6 = vfmacc_vf_f32m1(_sum6, r3[18], _k36, vl);
337 _sum7 = vfmacc_vf_f32m1(_sum7, r3[20], _k36, vl);
338
339 vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl);
340 vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl);
341 vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl);
342 vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl);
343 vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl);
344 vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl);
345 vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl);
346
347 kptr += packn * 7;
348
349 _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl);
350 _sum1 = vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl);
351 _sum2 = vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl);
352 _sum3 = vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl);
353 _sum4 = vfmacc_vf_f32m1(_sum4, r4[8], _k40, vl);
354 _sum5 = vfmacc_vf_f32m1(_sum5, r4[10], _k40, vl);
355 _sum6 = vfmacc_vf_f32m1(_sum6, r4[12], _k40, vl);
356 _sum7 = vfmacc_vf_f32m1(_sum7, r4[14], _k40, vl);
357 _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl);
358 _sum1 = vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl);
359 _sum2 = vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl);
360 _sum3 = vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl);
361 _sum4 = vfmacc_vf_f32m1(_sum4, r4[9], _k41, vl);
362 _sum5 = vfmacc_vf_f32m1(_sum5, r4[11], _k41, vl);
363 _sum6 = vfmacc_vf_f32m1(_sum6, r4[13], _k41, vl);
364 _sum7 = vfmacc_vf_f32m1(_sum7, r4[15], _k41, vl);
365 _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl);
366 _sum1 = vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl);
367 _sum2 = vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl);
368 _sum3 = vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl);
369 _sum4 = vfmacc_vf_f32m1(_sum4, r4[10], _k42, vl);
370 _sum5 = vfmacc_vf_f32m1(_sum5, r4[12], _k42, vl);
371 _sum6 = vfmacc_vf_f32m1(_sum6, r4[14], _k42, vl);
372 _sum7 = vfmacc_vf_f32m1(_sum7, r4[16], _k42, vl);
373 _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl);
374 _sum1 = vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl);
375 _sum2 = vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl);
376 _sum3 = vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl);
377 _sum4 = vfmacc_vf_f32m1(_sum4, r4[11], _k43, vl);
378 _sum5 = vfmacc_vf_f32m1(_sum5, r4[13], _k43, vl);
379 _sum6 = vfmacc_vf_f32m1(_sum6, r4[15], _k43, vl);
380 _sum7 = vfmacc_vf_f32m1(_sum7, r4[17], _k43, vl);
381 _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl);
382 _sum1 = vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl);
383 _sum2 = vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl);
384 _sum3 = vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl);
385 _sum4 = vfmacc_vf_f32m1(_sum4, r4[12], _k44, vl);
386 _sum5 = vfmacc_vf_f32m1(_sum5, r4[14], _k44, vl);
387 _sum6 = vfmacc_vf_f32m1(_sum6, r4[16], _k44, vl);
388 _sum7 = vfmacc_vf_f32m1(_sum7, r4[18], _k44, vl);
389 _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl);
390 _sum1 = vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl);
391 _sum2 = vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl);
392 _sum3 = vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl);
393 _sum4 = vfmacc_vf_f32m1(_sum4, r4[13], _k45, vl);
394 _sum5 = vfmacc_vf_f32m1(_sum5, r4[15], _k45, vl);
395 _sum6 = vfmacc_vf_f32m1(_sum6, r4[17], _k45, vl);
396 _sum7 = vfmacc_vf_f32m1(_sum7, r4[19], _k45, vl);
397 _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl);
398 _sum1 = vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl);
399 _sum2 = vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl);
400 _sum3 = vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl);
401 _sum4 = vfmacc_vf_f32m1(_sum4, r4[14], _k46, vl);
402 _sum5 = vfmacc_vf_f32m1(_sum5, r4[16], _k46, vl);
403 _sum6 = vfmacc_vf_f32m1(_sum6, r4[18], _k46, vl);
404 _sum7 = vfmacc_vf_f32m1(_sum7, r4[20], _k46, vl);
405
406 vfloat32m1_t _k50 = vle32_v_f32m1(kptr, vl);
407 vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl);
408 vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl);
409 vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl);
410 vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl);
411 vfloat32m1_t _k55 = vle32_v_f32m1(kptr + packn * 5, vl);
412 vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl);
413
414 kptr += packn * 7;
415
416 _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl);
417 _sum1 = vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl);
418 _sum2 = vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl);
419 _sum3 = vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl);
420 _sum4 = vfmacc_vf_f32m1(_sum4, r5[8], _k50, vl);
421 _sum5 = vfmacc_vf_f32m1(_sum5, r5[10], _k50, vl);
422 _sum6 = vfmacc_vf_f32m1(_sum6, r5[12], _k50, vl);
423 _sum7 = vfmacc_vf_f32m1(_sum7, r5[14], _k50, vl);
424 _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl);
425 _sum1 = vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl);
426 _sum2 = vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl);
427 _sum3 = vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl);
428 _sum4 = vfmacc_vf_f32m1(_sum4, r5[9], _k51, vl);
429 _sum5 = vfmacc_vf_f32m1(_sum5, r5[11], _k51, vl);
430 _sum6 = vfmacc_vf_f32m1(_sum6, r5[13], _k51, vl);
431 _sum7 = vfmacc_vf_f32m1(_sum7, r5[15], _k51, vl);
432 _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl);
433 _sum1 = vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl);
434 _sum2 = vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl);
435 _sum3 = vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl);
436 _sum4 = vfmacc_vf_f32m1(_sum4, r5[10], _k52, vl);
437 _sum5 = vfmacc_vf_f32m1(_sum5, r5[12], _k52, vl);
438 _sum6 = vfmacc_vf_f32m1(_sum6, r5[14], _k52, vl);
439 _sum7 = vfmacc_vf_f32m1(_sum7, r5[16], _k52, vl);
440 _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl);
441 _sum1 = vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl);
442 _sum2 = vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl);
443 _sum3 = vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl);
444 _sum4 = vfmacc_vf_f32m1(_sum4, r5[11], _k53, vl);
445 _sum5 = vfmacc_vf_f32m1(_sum5, r5[13], _k53, vl);
446 _sum6 = vfmacc_vf_f32m1(_sum6, r5[15], _k53, vl);
447 _sum7 = vfmacc_vf_f32m1(_sum7, r5[17], _k53, vl);
448 _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl);
449 _sum1 = vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl);
450 _sum2 = vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl);
451 _sum3 = vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl);
452 _sum4 = vfmacc_vf_f32m1(_sum4, r5[12], _k54, vl);
453 _sum5 = vfmacc_vf_f32m1(_sum5, r5[14], _k54, vl);
454 _sum6 = vfmacc_vf_f32m1(_sum6, r5[16], _k54, vl);
455 _sum7 = vfmacc_vf_f32m1(_sum7, r5[18], _k54, vl);
456 _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl);
457 _sum1 = vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl);
458 _sum2 = vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl);
459 _sum3 = vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl);
460 _sum4 = vfmacc_vf_f32m1(_sum4, r5[13], _k55, vl);
461 _sum5 = vfmacc_vf_f32m1(_sum5, r5[15], _k55, vl);
462 _sum6 = vfmacc_vf_f32m1(_sum6, r5[17], _k55, vl);
463 _sum7 = vfmacc_vf_f32m1(_sum7, r5[19], _k55, vl);
464 _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl);
465 _sum1 = vfmacc_vf_f32m1(_sum1, r5[8], _k56, vl);
466 _sum2 = vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl);
467 _sum3 = vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl);
468 _sum4 = vfmacc_vf_f32m1(_sum4, r5[14], _k56, vl);
469 _sum5 = vfmacc_vf_f32m1(_sum5, r5[16], _k56, vl);
470 _sum6 = vfmacc_vf_f32m1(_sum6, r5[18], _k56, vl);
471 _sum7 = vfmacc_vf_f32m1(_sum7, r5[20], _k56, vl);
472
473 vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl);
474 vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl);
475 vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl);
476 vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl);
477 vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl);
478 vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl);
479 vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl);
480
481 kptr -= packn * 42;
482
483 _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl);
484 _sum1 = vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl);
485 _sum2 = vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl);
486 _sum3 = vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl);
487 _sum4 = vfmacc_vf_f32m1(_sum4, r6[8], _k60, vl);
488 _sum5 = vfmacc_vf_f32m1(_sum5, r6[10], _k60, vl);
489 _sum6 = vfmacc_vf_f32m1(_sum6, r6[12], _k60, vl);
490 _sum7 = vfmacc_vf_f32m1(_sum7, r6[14], _k60, vl);
491 _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl);
492 _sum1 = vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl);
493 _sum2 = vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl);
494 _sum3 = vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl);
495 _sum4 = vfmacc_vf_f32m1(_sum4, r6[9], _k61, vl);
496 _sum5 = vfmacc_vf_f32m1(_sum5, r6[11], _k61, vl);
497 _sum6 = vfmacc_vf_f32m1(_sum6, r6[13], _k61, vl);
498 _sum7 = vfmacc_vf_f32m1(_sum7, r6[15], _k61, vl);
499 _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl);
500 _sum1 = vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl);
501 _sum2 = vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl);
502 _sum3 = vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl);
503 _sum4 = vfmacc_vf_f32m1(_sum4, r6[10], _k62, vl);
504 _sum5 = vfmacc_vf_f32m1(_sum5, r6[12], _k62, vl);
505 _sum6 = vfmacc_vf_f32m1(_sum6, r6[14], _k62, vl);
506 _sum7 = vfmacc_vf_f32m1(_sum7, r6[16], _k62, vl);
507 _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl);
508 _sum1 = vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl);
509 _sum2 = vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl);
510 _sum3 = vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl);
511 _sum4 = vfmacc_vf_f32m1(_sum4, r6[11], _k63, vl);
512 _sum5 = vfmacc_vf_f32m1(_sum5, r6[13], _k63, vl);
513 _sum6 = vfmacc_vf_f32m1(_sum6, r6[15], _k63, vl);
514 _sum7 = vfmacc_vf_f32m1(_sum7, r6[17], _k63, vl);
515 _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl);
516 _sum1 = vfmacc_vf_f32m1(_sum1, r6[6], _k64, vl);
517 _sum2 = vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl);
518 _sum3 = vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl);
519 _sum4 = vfmacc_vf_f32m1(_sum4, r6[12], _k64, vl);
520 _sum5 = vfmacc_vf_f32m1(_sum5, r6[14], _k64, vl);
521 _sum6 = vfmacc_vf_f32m1(_sum6, r6[16], _k64, vl);
522 _sum7 = vfmacc_vf_f32m1(_sum7, r6[18], _k64, vl);
523 _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl);
524 _sum1 = vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl);
525 _sum2 = vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl);
526 _sum3 = vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl);
527 _sum4 = vfmacc_vf_f32m1(_sum4, r6[13], _k65, vl);
528 _sum5 = vfmacc_vf_f32m1(_sum5, r6[15], _k65, vl);
529 _sum6 = vfmacc_vf_f32m1(_sum6, r6[17], _k65, vl);
530 _sum7 = vfmacc_vf_f32m1(_sum7, r6[19], _k65, vl);
531 _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl);
532 _sum1 = vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl);
533 _sum2 = vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl);
534 _sum3 = vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl);
535 _sum4 = vfmacc_vf_f32m1(_sum4, r6[14], _k66, vl);
536 _sum5 = vfmacc_vf_f32m1(_sum5, r6[16], _k66, vl);
537 _sum6 = vfmacc_vf_f32m1(_sum6, r6[18], _k66, vl);
538 _sum7 = vfmacc_vf_f32m1(_sum7, r6[20], _k66, vl);
539
540 vse32_v_f32m1(outptr0, _sum0, vl);
541 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
542 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
543 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
544 vse32_v_f32m1(outptr0 + packn * 4, _sum4, vl);
545 vse32_v_f32m1(outptr0 + packn * 5, _sum5, vl);
546 vse32_v_f32m1(outptr0 + packn * 6, _sum6, vl);
547 vse32_v_f32m1(outptr0 + packn * 7, _sum7, vl);
548
549 outptr0 += packn * 8;
550
551 r0 += 16;
552 r1 += 16;
553 r2 += 16;
554 r3 += 16;
555 r4 += 16;
556 r5 += 16;
557 r6 += 16;
558 }
559 for (; j + 3 < outw; j += 4)
560 {
561 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
562 vfloat32m1_t _sum1 = vle32_v_f32m1(outptr0 + packn, vl);
563 vfloat32m1_t _sum2 = vle32_v_f32m1(outptr0 + packn * 2, vl);
564 vfloat32m1_t _sum3 = vle32_v_f32m1(outptr0 + packn * 3, vl);
565
566 vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl);
567 vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl);
568 vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl);
569 vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl);
570 vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl);
571 vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl);
572 vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl);
573
574 kptr += packn * 7;
575
576 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
577 _sum1 = vfmacc_vf_f32m1(_sum1, r0[2], _k00, vl);
578 _sum2 = vfmacc_vf_f32m1(_sum2, r0[4], _k00, vl);
579 _sum3 = vfmacc_vf_f32m1(_sum3, r0[6], _k00, vl);
580 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
581 _sum1 = vfmacc_vf_f32m1(_sum1, r0[3], _k01, vl);
582 _sum2 = vfmacc_vf_f32m1(_sum2, r0[5], _k01, vl);
583 _sum3 = vfmacc_vf_f32m1(_sum3, r0[7], _k01, vl);
584 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
585 _sum1 = vfmacc_vf_f32m1(_sum1, r0[4], _k02, vl);
586 _sum2 = vfmacc_vf_f32m1(_sum2, r0[6], _k02, vl);
587 _sum3 = vfmacc_vf_f32m1(_sum3, r0[8], _k02, vl);
588 _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl);
589 _sum1 = vfmacc_vf_f32m1(_sum1, r0[5], _k03, vl);
590 _sum2 = vfmacc_vf_f32m1(_sum2, r0[7], _k03, vl);
591 _sum3 = vfmacc_vf_f32m1(_sum3, r0[9], _k03, vl);
592 _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl);
593 _sum1 = vfmacc_vf_f32m1(_sum1, r0[6], _k04, vl);
594 _sum2 = vfmacc_vf_f32m1(_sum2, r0[8], _k04, vl);
595 _sum3 = vfmacc_vf_f32m1(_sum3, r0[10], _k04, vl);
596 _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl);
597 _sum1 = vfmacc_vf_f32m1(_sum1, r0[7], _k05, vl);
598 _sum2 = vfmacc_vf_f32m1(_sum2, r0[9], _k05, vl);
599 _sum3 = vfmacc_vf_f32m1(_sum3, r0[11], _k05, vl);
600 _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl);
601 _sum1 = vfmacc_vf_f32m1(_sum1, r0[8], _k06, vl);
602 _sum2 = vfmacc_vf_f32m1(_sum2, r0[10], _k06, vl);
603 _sum3 = vfmacc_vf_f32m1(_sum3, r0[12], _k06, vl);
604
605 vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl);
606 vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl);
607 vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl);
608 vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl);
609 vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl);
610 vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl);
611 vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl);
612
613 kptr += packn * 7;
614
615 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
616 _sum1 = vfmacc_vf_f32m1(_sum1, r1[2], _k10, vl);
617 _sum2 = vfmacc_vf_f32m1(_sum2, r1[4], _k10, vl);
618 _sum3 = vfmacc_vf_f32m1(_sum3, r1[6], _k10, vl);
619 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
620 _sum1 = vfmacc_vf_f32m1(_sum1, r1[3], _k11, vl);
621 _sum2 = vfmacc_vf_f32m1(_sum2, r1[5], _k11, vl);
622 _sum3 = vfmacc_vf_f32m1(_sum3, r1[7], _k11, vl);
623 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
624 _sum1 = vfmacc_vf_f32m1(_sum1, r1[4], _k12, vl);
625 _sum2 = vfmacc_vf_f32m1(_sum2, r1[6], _k12, vl);
626 _sum3 = vfmacc_vf_f32m1(_sum3, r1[8], _k12, vl);
627 _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl);
628 _sum1 = vfmacc_vf_f32m1(_sum1, r1[5], _k13, vl);
629 _sum2 = vfmacc_vf_f32m1(_sum2, r1[7], _k13, vl);
630 _sum3 = vfmacc_vf_f32m1(_sum3, r1[9], _k13, vl);
631 _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl);
632 _sum1 = vfmacc_vf_f32m1(_sum1, r1[6], _k14, vl);
633 _sum2 = vfmacc_vf_f32m1(_sum2, r1[8], _k14, vl);
634 _sum3 = vfmacc_vf_f32m1(_sum3, r1[10], _k14, vl);
635 _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl);
636 _sum1 = vfmacc_vf_f32m1(_sum1, r1[7], _k15, vl);
637 _sum2 = vfmacc_vf_f32m1(_sum2, r1[9], _k15, vl);
638 _sum3 = vfmacc_vf_f32m1(_sum3, r1[11], _k15, vl);
639 _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl);
640 _sum1 = vfmacc_vf_f32m1(_sum1, r1[8], _k16, vl);
641 _sum2 = vfmacc_vf_f32m1(_sum2, r1[10], _k16, vl);
642 _sum3 = vfmacc_vf_f32m1(_sum3, r1[12], _k16, vl);
643
644 vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl);
645 vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl);
646 vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl);
647 vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl);
648 vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl);
649 vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl);
650 vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl);
651
652 kptr += packn * 7;
653
654 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
655 _sum1 = vfmacc_vf_f32m1(_sum1, r2[2], _k20, vl);
656 _sum2 = vfmacc_vf_f32m1(_sum2, r2[4], _k20, vl);
657 _sum3 = vfmacc_vf_f32m1(_sum3, r2[6], _k20, vl);
658 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
659 _sum1 = vfmacc_vf_f32m1(_sum1, r2[3], _k21, vl);
660 _sum2 = vfmacc_vf_f32m1(_sum2, r2[5], _k21, vl);
661 _sum3 = vfmacc_vf_f32m1(_sum3, r2[7], _k21, vl);
662 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
663 _sum1 = vfmacc_vf_f32m1(_sum1, r2[4], _k22, vl);
664 _sum2 = vfmacc_vf_f32m1(_sum2, r2[6], _k22, vl);
665 _sum3 = vfmacc_vf_f32m1(_sum3, r2[8], _k22, vl);
666 _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl);
667 _sum1 = vfmacc_vf_f32m1(_sum1, r2[5], _k23, vl);
668 _sum2 = vfmacc_vf_f32m1(_sum2, r2[7], _k23, vl);
669 _sum3 = vfmacc_vf_f32m1(_sum3, r2[9], _k23, vl);
670 _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl);
671 _sum1 = vfmacc_vf_f32m1(_sum1, r2[6], _k24, vl);
672 _sum2 = vfmacc_vf_f32m1(_sum2, r2[8], _k24, vl);
673 _sum3 = vfmacc_vf_f32m1(_sum3, r2[10], _k24, vl);
674 _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl);
675 _sum1 = vfmacc_vf_f32m1(_sum1, r2[7], _k25, vl);
676 _sum2 = vfmacc_vf_f32m1(_sum2, r2[9], _k25, vl);
677 _sum3 = vfmacc_vf_f32m1(_sum3, r2[11], _k25, vl);
678 _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl);
679 _sum1 = vfmacc_vf_f32m1(_sum1, r2[8], _k26, vl);
680 _sum2 = vfmacc_vf_f32m1(_sum2, r2[10], _k26, vl);
681 _sum3 = vfmacc_vf_f32m1(_sum3, r2[12], _k26, vl);
682
683 vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl);
684 vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl);
685 vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl);
686 vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl);
687 vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl);
688 vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl);
689 vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl);
690
691 kptr += packn * 7;
692
693 _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl);
694 _sum1 = vfmacc_vf_f32m1(_sum1, r3[2], _k30, vl);
695 _sum2 = vfmacc_vf_f32m1(_sum2, r3[4], _k30, vl);
696 _sum3 = vfmacc_vf_f32m1(_sum3, r3[6], _k30, vl);
697 _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl);
698 _sum1 = vfmacc_vf_f32m1(_sum1, r3[3], _k31, vl);
699 _sum2 = vfmacc_vf_f32m1(_sum2, r3[5], _k31, vl);
700 _sum3 = vfmacc_vf_f32m1(_sum3, r3[7], _k31, vl);
701 _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl);
702 _sum1 = vfmacc_vf_f32m1(_sum1, r3[4], _k32, vl);
703 _sum2 = vfmacc_vf_f32m1(_sum2, r3[6], _k32, vl);
704 _sum3 = vfmacc_vf_f32m1(_sum3, r3[8], _k32, vl);
705 _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl);
706 _sum1 = vfmacc_vf_f32m1(_sum1, r3[5], _k33, vl);
707 _sum2 = vfmacc_vf_f32m1(_sum2, r3[7], _k33, vl);
708 _sum3 = vfmacc_vf_f32m1(_sum3, r3[9], _k33, vl);
709 _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl);
710 _sum1 = vfmacc_vf_f32m1(_sum1, r3[6], _k34, vl);
711 _sum2 = vfmacc_vf_f32m1(_sum2, r3[8], _k34, vl);
712 _sum3 = vfmacc_vf_f32m1(_sum3, r3[10], _k34, vl);
713 _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl);
714 _sum1 = vfmacc_vf_f32m1(_sum1, r3[7], _k35, vl);
715 _sum2 = vfmacc_vf_f32m1(_sum2, r3[9], _k35, vl);
716 _sum3 = vfmacc_vf_f32m1(_sum3, r3[11], _k35, vl);
717 _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl);
718 _sum1 = vfmacc_vf_f32m1(_sum1, r3[8], _k36, vl);
719 _sum2 = vfmacc_vf_f32m1(_sum2, r3[10], _k36, vl);
720 _sum3 = vfmacc_vf_f32m1(_sum3, r3[12], _k36, vl);
721
722 vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl);
723 vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl);
724 vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl);
725 vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl);
726 vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl);
727 vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl);
728 vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl);
729
730 kptr += packn * 7;
731
732 _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl);
733 _sum1 = vfmacc_vf_f32m1(_sum1, r4[2], _k40, vl);
734 _sum2 = vfmacc_vf_f32m1(_sum2, r4[4], _k40, vl);
735 _sum3 = vfmacc_vf_f32m1(_sum3, r4[6], _k40, vl);
736 _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl);
737 _sum1 = vfmacc_vf_f32m1(_sum1, r4[3], _k41, vl);
738 _sum2 = vfmacc_vf_f32m1(_sum2, r4[5], _k41, vl);
739 _sum3 = vfmacc_vf_f32m1(_sum3, r4[7], _k41, vl);
740 _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl);
741 _sum1 = vfmacc_vf_f32m1(_sum1, r4[4], _k42, vl);
742 _sum2 = vfmacc_vf_f32m1(_sum2, r4[6], _k42, vl);
743 _sum3 = vfmacc_vf_f32m1(_sum3, r4[8], _k42, vl);
744 _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl);
745 _sum1 = vfmacc_vf_f32m1(_sum1, r4[5], _k43, vl);
746 _sum2 = vfmacc_vf_f32m1(_sum2, r4[7], _k43, vl);
747 _sum3 = vfmacc_vf_f32m1(_sum3, r4[9], _k43, vl);
748 _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl);
749 _sum1 = vfmacc_vf_f32m1(_sum1, r4[6], _k44, vl);
750 _sum2 = vfmacc_vf_f32m1(_sum2, r4[8], _k44, vl);
751 _sum3 = vfmacc_vf_f32m1(_sum3, r4[10], _k44, vl);
752 _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl);
753 _sum1 = vfmacc_vf_f32m1(_sum1, r4[7], _k45, vl);
754 _sum2 = vfmacc_vf_f32m1(_sum2, r4[9], _k45, vl);
755 _sum3 = vfmacc_vf_f32m1(_sum3, r4[11], _k45, vl);
756 _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl);
757 _sum1 = vfmacc_vf_f32m1(_sum1, r4[8], _k46, vl);
758 _sum2 = vfmacc_vf_f32m1(_sum2, r4[10], _k46, vl);
759 _sum3 = vfmacc_vf_f32m1(_sum3, r4[12], _k46, vl);
760
761 vfloat32m1_t _k50 = vle32_v_f32m1(kptr, vl);
762 vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl);
763 vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl);
764 vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl);
765 vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl);
766 vfloat32m1_t _k55 = vle32_v_f32m1(kptr + packn * 5, vl);
767 vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl);
768
769 kptr += packn * 7;
770
771 _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl);
772 _sum1 = vfmacc_vf_f32m1(_sum1, r5[2], _k50, vl);
773 _sum2 = vfmacc_vf_f32m1(_sum2, r5[4], _k50, vl);
774 _sum3 = vfmacc_vf_f32m1(_sum3, r5[6], _k50, vl);
775 _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl);
776 _sum1 = vfmacc_vf_f32m1(_sum1, r5[3], _k51, vl);
777 _sum2 = vfmacc_vf_f32m1(_sum2, r5[5], _k51, vl);
778 _sum3 = vfmacc_vf_f32m1(_sum3, r5[7], _k51, vl);
779 _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl);
780 _sum1 = vfmacc_vf_f32m1(_sum1, r5[4], _k52, vl);
781 _sum2 = vfmacc_vf_f32m1(_sum2, r5[6], _k52, vl);
782 _sum3 = vfmacc_vf_f32m1(_sum3, r5[8], _k52, vl);
783 _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl);
784 _sum1 = vfmacc_vf_f32m1(_sum1, r5[5], _k53, vl);
785 _sum2 = vfmacc_vf_f32m1(_sum2, r5[7], _k53, vl);
786 _sum3 = vfmacc_vf_f32m1(_sum3, r5[9], _k53, vl);
787 _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl);
788 _sum1 = vfmacc_vf_f32m1(_sum1, r5[6], _k54, vl);
789 _sum2 = vfmacc_vf_f32m1(_sum2, r5[8], _k54, vl);
790 _sum3 = vfmacc_vf_f32m1(_sum3, r5[10], _k54, vl);
791 _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl);
792 _sum1 = vfmacc_vf_f32m1(_sum1, r5[7], _k55, vl);
793 _sum2 = vfmacc_vf_f32m1(_sum2, r5[9], _k55, vl);
794 _sum3 = vfmacc_vf_f32m1(_sum3, r5[11], _k55, vl);
795 _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl);
796 _sum1 = vfmacc_vf_f32m1(_sum1, r5[8], _k56, vl);
797 _sum2 = vfmacc_vf_f32m1(_sum2, r5[10], _k56, vl);
798 _sum3 = vfmacc_vf_f32m1(_sum3, r5[12], _k56, vl);
799
800 vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl);
801 vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl);
802 vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl);
803 vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl);
804 vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl);
805 vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl);
806 vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl);
807
808 kptr -= packn * 42;
809
810 _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl);
811 _sum1 = vfmacc_vf_f32m1(_sum1, r6[2], _k60, vl);
812 _sum2 = vfmacc_vf_f32m1(_sum2, r6[4], _k60, vl);
813 _sum3 = vfmacc_vf_f32m1(_sum3, r6[6], _k60, vl);
814 _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl);
815 _sum1 = vfmacc_vf_f32m1(_sum1, r6[3], _k61, vl);
816 _sum2 = vfmacc_vf_f32m1(_sum2, r6[5], _k61, vl);
817 _sum3 = vfmacc_vf_f32m1(_sum3, r6[7], _k61, vl);
818 _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl);
819 _sum1 = vfmacc_vf_f32m1(_sum1, r6[4], _k62, vl);
820 _sum2 = vfmacc_vf_f32m1(_sum2, r6[6], _k62, vl);
821 _sum3 = vfmacc_vf_f32m1(_sum3, r6[8], _k62, vl);
822 _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl);
823 _sum1 = vfmacc_vf_f32m1(_sum1, r6[5], _k63, vl);
824 _sum2 = vfmacc_vf_f32m1(_sum2, r6[7], _k63, vl);
825 _sum3 = vfmacc_vf_f32m1(_sum3, r6[9], _k63, vl);
826 _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl);
827 _sum1 = vfmacc_vf_f32m1(_sum1, r6[6], _k64, vl);
828 _sum2 = vfmacc_vf_f32m1(_sum2, r6[8], _k64, vl);
829 _sum3 = vfmacc_vf_f32m1(_sum3, r6[10], _k64, vl);
830 _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl);
831 _sum1 = vfmacc_vf_f32m1(_sum1, r6[7], _k65, vl);
832 _sum2 = vfmacc_vf_f32m1(_sum2, r6[9], _k65, vl);
833 _sum3 = vfmacc_vf_f32m1(_sum3, r6[11], _k65, vl);
834 _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl);
835 _sum1 = vfmacc_vf_f32m1(_sum1, r6[8], _k66, vl);
836 _sum2 = vfmacc_vf_f32m1(_sum2, r6[10], _k66, vl);
837 _sum3 = vfmacc_vf_f32m1(_sum3, r6[12], _k66, vl);
838
839 vse32_v_f32m1(outptr0, _sum0, vl);
840 vse32_v_f32m1(outptr0 + packn, _sum1, vl);
841 vse32_v_f32m1(outptr0 + packn * 2, _sum2, vl);
842 vse32_v_f32m1(outptr0 + packn * 3, _sum3, vl);
843
844 outptr0 += packn * 4;
845
846 r0 += 8;
847 r1 += 8;
848 r2 += 8;
849 r3 += 8;
850 r4 += 8;
851 r5 += 8;
852 r6 += 8;
853 }
854 for (; j < outw; j++)
855 {
856 vfloat32m1_t _sum0 = vle32_v_f32m1(outptr0, vl);
857
858 vfloat32m1_t _k00 = vle32_v_f32m1(kptr, vl);
859 vfloat32m1_t _k01 = vle32_v_f32m1(kptr + packn, vl);
860 vfloat32m1_t _k02 = vle32_v_f32m1(kptr + packn * 2, vl);
861 vfloat32m1_t _k03 = vle32_v_f32m1(kptr + packn * 3, vl);
862 vfloat32m1_t _k04 = vle32_v_f32m1(kptr + packn * 4, vl);
863 vfloat32m1_t _k05 = vle32_v_f32m1(kptr + packn * 5, vl);
864 vfloat32m1_t _k06 = vle32_v_f32m1(kptr + packn * 6, vl);
865
866 kptr += packn * 7;
867
868 _sum0 = vfmacc_vf_f32m1(_sum0, r0[0], _k00, vl);
869 _sum0 = vfmacc_vf_f32m1(_sum0, r0[1], _k01, vl);
870 _sum0 = vfmacc_vf_f32m1(_sum0, r0[2], _k02, vl);
871 _sum0 = vfmacc_vf_f32m1(_sum0, r0[3], _k03, vl);
872 _sum0 = vfmacc_vf_f32m1(_sum0, r0[4], _k04, vl);
873 _sum0 = vfmacc_vf_f32m1(_sum0, r0[5], _k05, vl);
874 _sum0 = vfmacc_vf_f32m1(_sum0, r0[6], _k06, vl);
875
876 vfloat32m1_t _k10 = vle32_v_f32m1(kptr, vl);
877 vfloat32m1_t _k11 = vle32_v_f32m1(kptr + packn, vl);
878 vfloat32m1_t _k12 = vle32_v_f32m1(kptr + packn * 2, vl);
879 vfloat32m1_t _k13 = vle32_v_f32m1(kptr + packn * 3, vl);
880 vfloat32m1_t _k14 = vle32_v_f32m1(kptr + packn * 4, vl);
881 vfloat32m1_t _k15 = vle32_v_f32m1(kptr + packn * 5, vl);
882 vfloat32m1_t _k16 = vle32_v_f32m1(kptr + packn * 6, vl);
883
884 kptr += packn * 7;
885
886 _sum0 = vfmacc_vf_f32m1(_sum0, r1[0], _k10, vl);
887 _sum0 = vfmacc_vf_f32m1(_sum0, r1[1], _k11, vl);
888 _sum0 = vfmacc_vf_f32m1(_sum0, r1[2], _k12, vl);
889 _sum0 = vfmacc_vf_f32m1(_sum0, r1[3], _k13, vl);
890 _sum0 = vfmacc_vf_f32m1(_sum0, r1[4], _k14, vl);
891 _sum0 = vfmacc_vf_f32m1(_sum0, r1[5], _k15, vl);
892 _sum0 = vfmacc_vf_f32m1(_sum0, r1[6], _k16, vl);
893
894 vfloat32m1_t _k20 = vle32_v_f32m1(kptr, vl);
895 vfloat32m1_t _k21 = vle32_v_f32m1(kptr + packn, vl);
896 vfloat32m1_t _k22 = vle32_v_f32m1(kptr + packn * 2, vl);
897 vfloat32m1_t _k23 = vle32_v_f32m1(kptr + packn * 3, vl);
898 vfloat32m1_t _k24 = vle32_v_f32m1(kptr + packn * 4, vl);
899 vfloat32m1_t _k25 = vle32_v_f32m1(kptr + packn * 5, vl);
900 vfloat32m1_t _k26 = vle32_v_f32m1(kptr + packn * 6, vl);
901
902 kptr += packn * 7;
903
904 _sum0 = vfmacc_vf_f32m1(_sum0, r2[0], _k20, vl);
905 _sum0 = vfmacc_vf_f32m1(_sum0, r2[1], _k21, vl);
906 _sum0 = vfmacc_vf_f32m1(_sum0, r2[2], _k22, vl);
907 _sum0 = vfmacc_vf_f32m1(_sum0, r2[3], _k23, vl);
908 _sum0 = vfmacc_vf_f32m1(_sum0, r2[4], _k24, vl);
909 _sum0 = vfmacc_vf_f32m1(_sum0, r2[5], _k25, vl);
910 _sum0 = vfmacc_vf_f32m1(_sum0, r2[6], _k26, vl);
911
912 vfloat32m1_t _k30 = vle32_v_f32m1(kptr, vl);
913 vfloat32m1_t _k31 = vle32_v_f32m1(kptr + packn, vl);
914 vfloat32m1_t _k32 = vle32_v_f32m1(kptr + packn * 2, vl);
915 vfloat32m1_t _k33 = vle32_v_f32m1(kptr + packn * 3, vl);
916 vfloat32m1_t _k34 = vle32_v_f32m1(kptr + packn * 4, vl);
917 vfloat32m1_t _k35 = vle32_v_f32m1(kptr + packn * 5, vl);
918 vfloat32m1_t _k36 = vle32_v_f32m1(kptr + packn * 6, vl);
919
920 kptr += packn * 7;
921
922 _sum0 = vfmacc_vf_f32m1(_sum0, r3[0], _k30, vl);
923 _sum0 = vfmacc_vf_f32m1(_sum0, r3[1], _k31, vl);
924 _sum0 = vfmacc_vf_f32m1(_sum0, r3[2], _k32, vl);
925 _sum0 = vfmacc_vf_f32m1(_sum0, r3[3], _k33, vl);
926 _sum0 = vfmacc_vf_f32m1(_sum0, r3[4], _k34, vl);
927 _sum0 = vfmacc_vf_f32m1(_sum0, r3[5], _k35, vl);
928 _sum0 = vfmacc_vf_f32m1(_sum0, r3[6], _k36, vl);
929
930 vfloat32m1_t _k40 = vle32_v_f32m1(kptr, vl);
931 vfloat32m1_t _k41 = vle32_v_f32m1(kptr + packn, vl);
932 vfloat32m1_t _k42 = vle32_v_f32m1(kptr + packn * 2, vl);
933 vfloat32m1_t _k43 = vle32_v_f32m1(kptr + packn * 3, vl);
934 vfloat32m1_t _k44 = vle32_v_f32m1(kptr + packn * 4, vl);
935 vfloat32m1_t _k45 = vle32_v_f32m1(kptr + packn * 5, vl);
936 vfloat32m1_t _k46 = vle32_v_f32m1(kptr + packn * 6, vl);
937
938 kptr += packn * 7;
939
940 _sum0 = vfmacc_vf_f32m1(_sum0, r4[0], _k40, vl);
941 _sum0 = vfmacc_vf_f32m1(_sum0, r4[1], _k41, vl);
942 _sum0 = vfmacc_vf_f32m1(_sum0, r4[2], _k42, vl);
943 _sum0 = vfmacc_vf_f32m1(_sum0, r4[3], _k43, vl);
944 _sum0 = vfmacc_vf_f32m1(_sum0, r4[4], _k44, vl);
945 _sum0 = vfmacc_vf_f32m1(_sum0, r4[5], _k45, vl);
946 _sum0 = vfmacc_vf_f32m1(_sum0, r4[6], _k46, vl);
947
948 vfloat32m1_t _k50 = vle32_v_f32m1(kptr, vl);
949 vfloat32m1_t _k51 = vle32_v_f32m1(kptr + packn, vl);
950 vfloat32m1_t _k52 = vle32_v_f32m1(kptr + packn * 2, vl);
951 vfloat32m1_t _k53 = vle32_v_f32m1(kptr + packn * 3, vl);
952 vfloat32m1_t _k54 = vle32_v_f32m1(kptr + packn * 4, vl);
953 vfloat32m1_t _k55 = vle32_v_f32m1(kptr + packn * 5, vl);
954 vfloat32m1_t _k56 = vle32_v_f32m1(kptr + packn * 6, vl);
955
956 kptr += packn * 7;
957
958 _sum0 = vfmacc_vf_f32m1(_sum0, r5[0], _k50, vl);
959 _sum0 = vfmacc_vf_f32m1(_sum0, r5[1], _k51, vl);
960 _sum0 = vfmacc_vf_f32m1(_sum0, r5[2], _k52, vl);
961 _sum0 = vfmacc_vf_f32m1(_sum0, r5[3], _k53, vl);
962 _sum0 = vfmacc_vf_f32m1(_sum0, r5[4], _k54, vl);
963 _sum0 = vfmacc_vf_f32m1(_sum0, r5[5], _k55, vl);
964 _sum0 = vfmacc_vf_f32m1(_sum0, r5[6], _k56, vl);
965
966 vfloat32m1_t _k60 = vle32_v_f32m1(kptr, vl);
967 vfloat32m1_t _k61 = vle32_v_f32m1(kptr + packn, vl);
968 vfloat32m1_t _k62 = vle32_v_f32m1(kptr + packn * 2, vl);
969 vfloat32m1_t _k63 = vle32_v_f32m1(kptr + packn * 3, vl);
970 vfloat32m1_t _k64 = vle32_v_f32m1(kptr + packn * 4, vl);
971 vfloat32m1_t _k65 = vle32_v_f32m1(kptr + packn * 5, vl);
972 vfloat32m1_t _k66 = vle32_v_f32m1(kptr + packn * 6, vl);
973
974 kptr -= packn * 42;
975
976 _sum0 = vfmacc_vf_f32m1(_sum0, r6[0], _k60, vl);
977 _sum0 = vfmacc_vf_f32m1(_sum0, r6[1], _k61, vl);
978 _sum0 = vfmacc_vf_f32m1(_sum0, r6[2], _k62, vl);
979 _sum0 = vfmacc_vf_f32m1(_sum0, r6[3], _k63, vl);
980 _sum0 = vfmacc_vf_f32m1(_sum0, r6[4], _k64, vl);
981 _sum0 = vfmacc_vf_f32m1(_sum0, r6[5], _k65, vl);
982 _sum0 = vfmacc_vf_f32m1(_sum0, r6[6], _k66, vl);
983
984 vse32_v_f32m1(outptr0, _sum0, vl);
985
986 outptr0 += packn;
987
988 r0 += 2;
989 r1 += 2;
990 r2 += 2;
991 r3 += 2;
992 r4 += 2;
993 r5 += 2;
994 r6 += 2;
995 }
996
997 r0 += tailstep;
998 r1 += tailstep;
999 r2 += tailstep;
1000 r3 += tailstep;
1001 r4 += tailstep;
1002 r5 += tailstep;
1003 r6 += tailstep;
1004 }
1005 }
1006 }
1007 }
1008