1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
// Depthwise convolution, 5x5 kernel, stride 1, pack4 layout (4 channels
// interleaved per "pixel"), implemented with MIPS MSA vector intrinsics.
//
// bottom_blob : input feature map, already zero-/border-padded so that
//               w == outw + 4 (5 taps, stride 1)  — assumed by the pointer
//               arithmetic below; TODO confirm with the caller.
// top_blob    : pre-allocated output feature map (outw x outh).
// kernel      : per-group 5x5 kernel, 25 * 4 floats per row(g).
// _bias       : optional per-channel bias (4 floats per group), may be empty.
// opt         : threading options (num_threads for the OpenMP loop).
static void convdw5x5s1_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // Depthwise: one input channel (group) maps to exactly one output channel.
    const int group = bottom_blob.c;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // Bias for the 4 packed lanes of this group, or zero when bias is absent.
        v4f32 _bias0 = bias ? (v4f32)__msa_ld_w(bias + g * 4, 0) : (v4f32)__msa_fill_w(0);

        const float* k0 = kernel.row(g);

        // Two output rows are produced per outer iteration.
        float* outptr0 = out.row(0);
        float* outptr1 = out.row(1);

        const Mat img0 = bottom_blob.channel(g);

        // Six input rows feed two 5-tap output rows; rows r1..r4 are shared
        // between the two accumulators, so each is loaded only once below.
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);
        const float* r5 = img0.row(5);

        int i = 0;
        // Main loop: compute output rows i and i+1 together.
        for (; i + 1 < outh; i += 2)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 160);
                __builtin_prefetch(r1 + 160);
                __builtin_prefetch(r2 + 160);
                __builtin_prefetch(r3 + 160);
                __builtin_prefetch(r4 + 160);
                __builtin_prefetch(r5 + 160);

                __builtin_prefetch(k0 + 800);

                // _sum0 accumulates output row i, _sum1 output row i+1.
                v4f32 _sum0 = _bias0;
                v4f32 _sum1 = _bias0;

                // Kernel row t is loaded once and applied to input row t
                // (into _sum0) and to input row t+1 (into _sum1).

                // --- kernel row 0 ---
                v4f32 _r00 = (v4f32)__msa_ld_w(r0, 0);
                v4f32 _r01 = (v4f32)__msa_ld_w(r0 + 4, 0);
                v4f32 _r02 = (v4f32)__msa_ld_w(r0 + 4 * 2, 0);
                v4f32 _r03 = (v4f32)__msa_ld_w(r0 + 4 * 3, 0);
                v4f32 _r04 = (v4f32)__msa_ld_w(r0 + 4 * 4, 0);

                v4f32 _k00 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k01 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k02 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k03 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k04 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k00, _r00);
                _sum0 = __msa_fmadd_w(_sum0, _k01, _r01);
                _sum0 = __msa_fmadd_w(_sum0, _k02, _r02);
                _sum0 = __msa_fmadd_w(_sum0, _k03, _r03);
                _sum0 = __msa_fmadd_w(_sum0, _k04, _r04);

                v4f32 _r10 = (v4f32)__msa_ld_w(r1, 0);
                v4f32 _r11 = (v4f32)__msa_ld_w(r1 + 4, 0);
                v4f32 _r12 = (v4f32)__msa_ld_w(r1 + 4 * 2, 0);
                v4f32 _r13 = (v4f32)__msa_ld_w(r1 + 4 * 3, 0);
                v4f32 _r14 = (v4f32)__msa_ld_w(r1 + 4 * 4, 0);

                _sum1 = __msa_fmadd_w(_sum1, _k00, _r10);
                _sum1 = __msa_fmadd_w(_sum1, _k01, _r11);
                _sum1 = __msa_fmadd_w(_sum1, _k02, _r12);
                _sum1 = __msa_fmadd_w(_sum1, _k03, _r13);
                _sum1 = __msa_fmadd_w(_sum1, _k04, _r14);

                // --- kernel row 1 (reuses the row-1 input loads above) ---
                v4f32 _k10 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k11 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k12 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k13 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k14 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k10, _r10);
                _sum0 = __msa_fmadd_w(_sum0, _k11, _r11);
                _sum0 = __msa_fmadd_w(_sum0, _k12, _r12);
                _sum0 = __msa_fmadd_w(_sum0, _k13, _r13);
                _sum0 = __msa_fmadd_w(_sum0, _k14, _r14);

                v4f32 _r20 = (v4f32)__msa_ld_w(r2, 0);
                v4f32 _r21 = (v4f32)__msa_ld_w(r2 + 4, 0);
                v4f32 _r22 = (v4f32)__msa_ld_w(r2 + 4 * 2, 0);
                v4f32 _r23 = (v4f32)__msa_ld_w(r2 + 4 * 3, 0);
                v4f32 _r24 = (v4f32)__msa_ld_w(r2 + 4 * 4, 0);

                _sum1 = __msa_fmadd_w(_sum1, _k10, _r20);
                _sum1 = __msa_fmadd_w(_sum1, _k11, _r21);
                _sum1 = __msa_fmadd_w(_sum1, _k12, _r22);
                _sum1 = __msa_fmadd_w(_sum1, _k13, _r23);
                _sum1 = __msa_fmadd_w(_sum1, _k14, _r24);

                // --- kernel row 2 ---
                v4f32 _k20 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k21 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k22 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k23 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k24 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k20, _r20);
                _sum0 = __msa_fmadd_w(_sum0, _k21, _r21);
                _sum0 = __msa_fmadd_w(_sum0, _k22, _r22);
                _sum0 = __msa_fmadd_w(_sum0, _k23, _r23);
                _sum0 = __msa_fmadd_w(_sum0, _k24, _r24);

                v4f32 _r30 = (v4f32)__msa_ld_w(r3, 0);
                v4f32 _r31 = (v4f32)__msa_ld_w(r3 + 4, 0);
                v4f32 _r32 = (v4f32)__msa_ld_w(r3 + 4 * 2, 0);
                v4f32 _r33 = (v4f32)__msa_ld_w(r3 + 4 * 3, 0);
                v4f32 _r34 = (v4f32)__msa_ld_w(r3 + 4 * 4, 0);

                _sum1 = __msa_fmadd_w(_sum1, _k20, _r30);
                _sum1 = __msa_fmadd_w(_sum1, _k21, _r31);
                _sum1 = __msa_fmadd_w(_sum1, _k22, _r32);
                _sum1 = __msa_fmadd_w(_sum1, _k23, _r33);
                _sum1 = __msa_fmadd_w(_sum1, _k24, _r34);

                // --- kernel row 3 ---
                v4f32 _k30 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k31 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k32 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k33 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k34 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k30, _r30);
                _sum0 = __msa_fmadd_w(_sum0, _k31, _r31);
                _sum0 = __msa_fmadd_w(_sum0, _k32, _r32);
                _sum0 = __msa_fmadd_w(_sum0, _k33, _r33);
                _sum0 = __msa_fmadd_w(_sum0, _k34, _r34);

                v4f32 _r40 = (v4f32)__msa_ld_w(r4, 0);
                v4f32 _r41 = (v4f32)__msa_ld_w(r4 + 4, 0);
                v4f32 _r42 = (v4f32)__msa_ld_w(r4 + 4 * 2, 0);
                v4f32 _r43 = (v4f32)__msa_ld_w(r4 + 4 * 3, 0);
                v4f32 _r44 = (v4f32)__msa_ld_w(r4 + 4 * 4, 0);

                _sum1 = __msa_fmadd_w(_sum1, _k30, _r40);
                _sum1 = __msa_fmadd_w(_sum1, _k31, _r41);
                _sum1 = __msa_fmadd_w(_sum1, _k32, _r42);
                _sum1 = __msa_fmadd_w(_sum1, _k33, _r43);
                _sum1 = __msa_fmadd_w(_sum1, _k34, _r44);

                // --- kernel row 4 ---
                v4f32 _k40 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k41 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k42 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k43 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k44 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                // Rewind k0 to the start of the 5x5 kernel for the next output
                // pixel: the four `k0 += 4 * 5` above advanced it by 4 * 20 floats.
                k0 -= 4 * 20;

                _sum0 = __msa_fmadd_w(_sum0, _k40, _r40);
                _sum0 = __msa_fmadd_w(_sum0, _k41, _r41);
                _sum0 = __msa_fmadd_w(_sum0, _k42, _r42);
                _sum0 = __msa_fmadd_w(_sum0, _k43, _r43);
                _sum0 = __msa_fmadd_w(_sum0, _k44, _r44);

                v4f32 _r50 = (v4f32)__msa_ld_w(r5, 0);
                v4f32 _r51 = (v4f32)__msa_ld_w(r5 + 4, 0);
                v4f32 _r52 = (v4f32)__msa_ld_w(r5 + 4 * 2, 0);
                v4f32 _r53 = (v4f32)__msa_ld_w(r5 + 4 * 3, 0);
                v4f32 _r54 = (v4f32)__msa_ld_w(r5 + 4 * 4, 0);

                _sum1 = __msa_fmadd_w(_sum1, _k40, _r50);
                _sum1 = __msa_fmadd_w(_sum1, _k41, _r51);
                _sum1 = __msa_fmadd_w(_sum1, _k42, _r52);
                _sum1 = __msa_fmadd_w(_sum1, _k43, _r53);
                _sum1 = __msa_fmadd_w(_sum1, _k44, _r54);

                // Store one pack4 output pixel on each of the two output rows.
                __msa_st_w((v4i32)_sum0, outptr0, 0);
                __msa_st_w((v4i32)_sum1, outptr1, 0);

                outptr0 += 4;
                outptr1 += 4;

                // Stride 1: advance all input rows by one pack4 pixel.
                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
                r5 += 4;
            }

            // Row-pair tail: skip the 4 extra taps at the end of the padded
            // input row (w == outw + 4), plus one full row (w * 4) because two
            // output rows were consumed this iteration.
            r0 += 4 * 4 + w * 4;
            r1 += 4 * 4 + w * 4;
            r2 += 4 * 4 + w * 4;
            r3 += 4 * 4 + w * 4;
            r4 += 4 * 4 + w * 4;
            r5 += 4 * 4 + w * 4;

            // Each output pointer skips the row the other pointer just wrote.
            outptr0 += outw * 4;
            outptr1 += outw * 4;
        }
        // Tail: remaining single output row (when outh is odd).
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 160);
                __builtin_prefetch(r1 + 160);
                __builtin_prefetch(r2 + 160);
                __builtin_prefetch(r3 + 160);
                __builtin_prefetch(r4 + 160);

                __builtin_prefetch(k0 + 800);

                v4f32 _sum0 = _bias0;

                // --- kernel row 0 ---
                v4f32 _r00 = (v4f32)__msa_ld_w(r0, 0);
                v4f32 _r01 = (v4f32)__msa_ld_w(r0 + 4, 0);
                v4f32 _r02 = (v4f32)__msa_ld_w(r0 + 4 * 2, 0);
                v4f32 _r03 = (v4f32)__msa_ld_w(r0 + 4 * 3, 0);
                v4f32 _r04 = (v4f32)__msa_ld_w(r0 + 4 * 4, 0);

                v4f32 _k00 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k01 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k02 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k03 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k04 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k00, _r00);
                _sum0 = __msa_fmadd_w(_sum0, _k01, _r01);
                _sum0 = __msa_fmadd_w(_sum0, _k02, _r02);
                _sum0 = __msa_fmadd_w(_sum0, _k03, _r03);
                _sum0 = __msa_fmadd_w(_sum0, _k04, _r04);

                // --- kernel row 1 ---
                v4f32 _r10 = (v4f32)__msa_ld_w(r1, 0);
                v4f32 _r11 = (v4f32)__msa_ld_w(r1 + 4, 0);
                v4f32 _r12 = (v4f32)__msa_ld_w(r1 + 4 * 2, 0);
                v4f32 _r13 = (v4f32)__msa_ld_w(r1 + 4 * 3, 0);
                v4f32 _r14 = (v4f32)__msa_ld_w(r1 + 4 * 4, 0);

                v4f32 _k10 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k11 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k12 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k13 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k14 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k10, _r10);
                _sum0 = __msa_fmadd_w(_sum0, _k11, _r11);
                _sum0 = __msa_fmadd_w(_sum0, _k12, _r12);
                _sum0 = __msa_fmadd_w(_sum0, _k13, _r13);
                _sum0 = __msa_fmadd_w(_sum0, _k14, _r14);

                // --- kernel row 2 ---
                v4f32 _r20 = (v4f32)__msa_ld_w(r2, 0);
                v4f32 _r21 = (v4f32)__msa_ld_w(r2 + 4, 0);
                v4f32 _r22 = (v4f32)__msa_ld_w(r2 + 4 * 2, 0);
                v4f32 _r23 = (v4f32)__msa_ld_w(r2 + 4 * 3, 0);
                v4f32 _r24 = (v4f32)__msa_ld_w(r2 + 4 * 4, 0);

                v4f32 _k20 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k21 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k22 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k23 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k24 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k20, _r20);
                _sum0 = __msa_fmadd_w(_sum0, _k21, _r21);
                _sum0 = __msa_fmadd_w(_sum0, _k22, _r22);
                _sum0 = __msa_fmadd_w(_sum0, _k23, _r23);
                _sum0 = __msa_fmadd_w(_sum0, _k24, _r24);

                // --- kernel row 3 ---
                v4f32 _r30 = (v4f32)__msa_ld_w(r3, 0);
                v4f32 _r31 = (v4f32)__msa_ld_w(r3 + 4, 0);
                v4f32 _r32 = (v4f32)__msa_ld_w(r3 + 4 * 2, 0);
                v4f32 _r33 = (v4f32)__msa_ld_w(r3 + 4 * 3, 0);
                v4f32 _r34 = (v4f32)__msa_ld_w(r3 + 4 * 4, 0);

                v4f32 _k30 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k31 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k32 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k33 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k34 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k30, _r30);
                _sum0 = __msa_fmadd_w(_sum0, _k31, _r31);
                _sum0 = __msa_fmadd_w(_sum0, _k32, _r32);
                _sum0 = __msa_fmadd_w(_sum0, _k33, _r33);
                _sum0 = __msa_fmadd_w(_sum0, _k34, _r34);

                // --- kernel row 4 ---
                v4f32 _r40 = (v4f32)__msa_ld_w(r4, 0);
                v4f32 _r41 = (v4f32)__msa_ld_w(r4 + 4, 0);
                v4f32 _r42 = (v4f32)__msa_ld_w(r4 + 4 * 2, 0);
                v4f32 _r43 = (v4f32)__msa_ld_w(r4 + 4 * 3, 0);
                v4f32 _r44 = (v4f32)__msa_ld_w(r4 + 4 * 4, 0);

                v4f32 _k40 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k41 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k42 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k43 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k44 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                // Rewind to the start of the 5x5 kernel for the next pixel.
                k0 -= 4 * 20;

                _sum0 = __msa_fmadd_w(_sum0, _k40, _r40);
                _sum0 = __msa_fmadd_w(_sum0, _k41, _r41);
                _sum0 = __msa_fmadd_w(_sum0, _k42, _r42);
                _sum0 = __msa_fmadd_w(_sum0, _k43, _r43);
                _sum0 = __msa_fmadd_w(_sum0, _k44, _r44);

                __msa_st_w((v4i32)_sum0, outptr0, 0);

                outptr0 += 4;

                r0 += 4;
                r1 += 4;
                r2 += 4;
                r3 += 4;
                r4 += 4;
            }

            // Skip the 4 extra taps of the padded row to reach the next input row.
            r0 += 4 * 4;
            r1 += 4 * 4;
            r2 += 4 * 4;
            r3 += 4 * 4;
            r4 += 4 * 4;
        }
    }
}
349
// Depthwise convolution, 5x5 kernel, stride 2, pack4 layout (4 channels
// interleaved per "pixel"), implemented with MIPS MSA vector intrinsics.
//
// bottom_blob : input feature map, already padded for the 5x5 window.
// top_blob    : pre-allocated output feature map (outw x outh).
// kernel      : per-group 5x5 kernel, 25 * 4 floats per row(g).
// _bias       : optional per-channel bias (4 floats per group), may be empty.
// opt         : threading options (num_threads for the OpenMP loop).
static void convdw5x5s2_pack4_msa(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt)
{
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // Depthwise: one input channel (group) maps to exactly one output channel.
    const int group = bottom_blob.c;

    // Floats to skip after finishing a row: the unprocessed remainder of the
    // current input row (w - 2 * outw pixels) plus one full row, because
    // stride 2 consumes two input rows per output row. pack4 => * 4.
    const int tailstep = (w - 2 * outw + w) * 4;

    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        Mat out = top_blob.channel(g);

        // Bias for the 4 packed lanes of this group, or zero when bias is absent.
        v4f32 _bias0 = bias ? (v4f32)__msa_ld_w(bias + g * 4, 0) : (v4f32)__msa_fill_w(0);

        const float* k0 = kernel.row(g);

        float* outptr0 = out;

        const Mat img0 = bottom_blob.channel(g);

        // Five input rows feed one 5-tap output row.
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);
        const float* r3 = img0.row(3);
        const float* r4 = img0.row(4);

        int i = 0;
        for (; i < outh; i++)
        {
            int j = 0;
            for (; j < outw; j++)
            {
                __builtin_prefetch(r0 + 160);
                __builtin_prefetch(r1 + 160);
                __builtin_prefetch(r2 + 160);
                __builtin_prefetch(r3 + 160);
                __builtin_prefetch(r4 + 160);

                __builtin_prefetch(k0 + 800);

                v4f32 _sum0 = _bias0;

                // --- kernel row 0 applied to input row 0 ---
                v4f32 _r00 = (v4f32)__msa_ld_w(r0, 0);
                v4f32 _r01 = (v4f32)__msa_ld_w(r0 + 4, 0);
                v4f32 _r02 = (v4f32)__msa_ld_w(r0 + 4 * 2, 0);
                v4f32 _r03 = (v4f32)__msa_ld_w(r0 + 4 * 3, 0);
                v4f32 _r04 = (v4f32)__msa_ld_w(r0 + 4 * 4, 0);

                v4f32 _k00 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k01 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k02 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k03 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k04 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k00, _r00);
                _sum0 = __msa_fmadd_w(_sum0, _k01, _r01);
                _sum0 = __msa_fmadd_w(_sum0, _k02, _r02);
                _sum0 = __msa_fmadd_w(_sum0, _k03, _r03);
                _sum0 = __msa_fmadd_w(_sum0, _k04, _r04);

                // --- kernel row 1 applied to input row 1 ---
                v4f32 _r10 = (v4f32)__msa_ld_w(r1, 0);
                v4f32 _r11 = (v4f32)__msa_ld_w(r1 + 4, 0);
                v4f32 _r12 = (v4f32)__msa_ld_w(r1 + 4 * 2, 0);
                v4f32 _r13 = (v4f32)__msa_ld_w(r1 + 4 * 3, 0);
                v4f32 _r14 = (v4f32)__msa_ld_w(r1 + 4 * 4, 0);

                v4f32 _k10 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k11 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k12 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k13 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k14 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k10, _r10);
                _sum0 = __msa_fmadd_w(_sum0, _k11, _r11);
                _sum0 = __msa_fmadd_w(_sum0, _k12, _r12);
                _sum0 = __msa_fmadd_w(_sum0, _k13, _r13);
                _sum0 = __msa_fmadd_w(_sum0, _k14, _r14);

                // --- kernel row 2 applied to input row 2 ---
                v4f32 _r20 = (v4f32)__msa_ld_w(r2, 0);
                v4f32 _r21 = (v4f32)__msa_ld_w(r2 + 4, 0);
                v4f32 _r22 = (v4f32)__msa_ld_w(r2 + 4 * 2, 0);
                v4f32 _r23 = (v4f32)__msa_ld_w(r2 + 4 * 3, 0);
                v4f32 _r24 = (v4f32)__msa_ld_w(r2 + 4 * 4, 0);

                v4f32 _k20 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k21 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k22 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k23 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k24 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k20, _r20);
                _sum0 = __msa_fmadd_w(_sum0, _k21, _r21);
                _sum0 = __msa_fmadd_w(_sum0, _k22, _r22);
                _sum0 = __msa_fmadd_w(_sum0, _k23, _r23);
                _sum0 = __msa_fmadd_w(_sum0, _k24, _r24);

                // --- kernel row 3 applied to input row 3 ---
                v4f32 _r30 = (v4f32)__msa_ld_w(r3, 0);
                v4f32 _r31 = (v4f32)__msa_ld_w(r3 + 4, 0);
                v4f32 _r32 = (v4f32)__msa_ld_w(r3 + 4 * 2, 0);
                v4f32 _r33 = (v4f32)__msa_ld_w(r3 + 4 * 3, 0);
                v4f32 _r34 = (v4f32)__msa_ld_w(r3 + 4 * 4, 0);

                v4f32 _k30 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k31 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k32 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k33 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k34 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                k0 += 4 * 5;

                _sum0 = __msa_fmadd_w(_sum0, _k30, _r30);
                _sum0 = __msa_fmadd_w(_sum0, _k31, _r31);
                _sum0 = __msa_fmadd_w(_sum0, _k32, _r32);
                _sum0 = __msa_fmadd_w(_sum0, _k33, _r33);
                _sum0 = __msa_fmadd_w(_sum0, _k34, _r34);

                // --- kernel row 4 applied to input row 4 ---
                v4f32 _r40 = (v4f32)__msa_ld_w(r4, 0);
                v4f32 _r41 = (v4f32)__msa_ld_w(r4 + 4, 0);
                v4f32 _r42 = (v4f32)__msa_ld_w(r4 + 4 * 2, 0);
                v4f32 _r43 = (v4f32)__msa_ld_w(r4 + 4 * 3, 0);
                v4f32 _r44 = (v4f32)__msa_ld_w(r4 + 4 * 4, 0);

                v4f32 _k40 = (v4f32)__msa_ld_w(k0, 0);
                v4f32 _k41 = (v4f32)__msa_ld_w(k0 + 4, 0);
                v4f32 _k42 = (v4f32)__msa_ld_w(k0 + 4 * 2, 0);
                v4f32 _k43 = (v4f32)__msa_ld_w(k0 + 4 * 3, 0);
                v4f32 _k44 = (v4f32)__msa_ld_w(k0 + 4 * 4, 0);
                // Rewind k0 to the start of the 5x5 kernel for the next output
                // pixel: the four `k0 += 4 * 5` above advanced it by 4 * 20 floats.
                k0 -= 4 * 20;

                _sum0 = __msa_fmadd_w(_sum0, _k40, _r40);
                _sum0 = __msa_fmadd_w(_sum0, _k41, _r41);
                _sum0 = __msa_fmadd_w(_sum0, _k42, _r42);
                _sum0 = __msa_fmadd_w(_sum0, _k43, _r43);
                _sum0 = __msa_fmadd_w(_sum0, _k44, _r44);

                // Store one pack4 output pixel.
                __msa_st_w((v4i32)_sum0, outptr0, 0);

                outptr0 += 4;

                // Stride 2: advance all input rows by two pack4 pixels.
                r0 += 4 * 2;
                r1 += 4 * 2;
                r2 += 4 * 2;
                r3 += 4 * 2;
                r4 += 4 * 2;
            }

            // Jump to the start of the next output row's input window
            // (skips the row remainder plus one extra input row).
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
            r3 += tailstep;
            r4 += tailstep;
        }
    }
}
512