/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "h264chroma_mips.h"

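/* Byte-shuffle control vectors for __msa_vshf_b: each 16-byte row gathers the
 * pixel pairs consumed by the 2-tap chroma filters below.  The filter
 * routines select a row by byte offset (0, 16, 32, 48 or 64) into this table. */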
static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};

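/* Horizontal-only chroma interpolation (put): the shuffled pixel pairs are
 * dot-multiplied with the two horizontal taps; the sum is scaled by 8 (<< 3)
 * so a single rounding shift (srari by 6) gives (sum + 4) >> 3, then the
 * result is saturated to 8 bits and packed back to bytes. */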
static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3;
    v8u16 res_r;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[64]);

    LD_UB4(src, stride, src0, src1, src2, src3);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);

    src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);

    res_r = __msa_dotp_u_h(src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16i8 src0, src1;
    v8u16 res_r;
    v4i32 res;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_SB2(src, stride, src0, src1);

    src0 = __msa_vshf_b(mask, src1, src0);
    res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out;
    v8u16 res0_r, res1_r;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
    v16i8 mask;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
    DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);
    LD_UB4(src, stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
                                      int32_t stride, uint32_t coeff0,
                                      uint32_t coeff1, int32_t height)
{
    uint32_t row;
    v16u8 src0, src1, src2, src3, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 mask;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    mask = LD_SB(&chroma_mask_arr[32]);

    for (row = height >> 2; row--;) {
        LD_UB4(src, stride, src0, src1, src2, src3);
        src += (4 * stride);

        VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
        VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
        DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                    coeff_vec, res0, res1, res2, res3);
        SLLI_4V(res0, res1, res2, res3, 3);
        SRARI_H4_UH(res0, res1, res2, res3, 6);
        SAT_UH4_UH(res0, res1, res2, res3, 7);
        PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }

    if (0 != (height % 4)) {
        for (row = (height % 4); row--;) {
            src0 = LD_UB(src);
            src += stride;

            src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);

            res0 = __msa_dotp_u_h(src0, coeff_vec);
            res0 <<= 3;
            res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
            res0 = __msa_sat_u_h(res0, 7);
            res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);

            ST_D1(res0, 0, dst);
            dst += stride;
        }
    }
}

static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
    } else {
        avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
    }
}

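/* Vertical-only chroma interpolation (put): rows are interleaved with the
 * ILVR_B* macros so each byte pair holds vertically adjacent pixels, then the
 * same dot-product / scale / round / saturate / pack sequence is applied. */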
static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);

    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    out0 = __msa_copy_u_h(res, 0);
    out1 = __msa_copy_u_h(res, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2;
    v16u8 tmp0, tmp1;
    v4i32 res;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 out;
    v8u16 res0_r, res1_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coeff0, uint32_t coeff1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

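/* 2-D (horizontal + vertical) chroma interpolation (put): the horizontal
 * 2-tap result is kept at 16-bit precision, weighted by the two vertical taps
 * and accumulated, so only one rounding shift by 6 is needed at the end. */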
static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res_vert;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    out0 = __msa_copy_u_h(res_vert, 0);
    out1 = __msa_copy_u_h(res_vert, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v8i16 res;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);

    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v4i32 res;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);
    LD_UB3(src, stride, src0, src1, src2);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
    v4i32 res0, res1;

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
    ST_W2(res0, 0, 1, dst, stride);
    ST_W2(res1, 0, 1, dst + 2 * stride, stride);
}

static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);

    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;

    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);

    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);

    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);

    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);

    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coef_hor0, uint32_t coef_hor1,
                                  uint32_t coef_ver0, uint32_t coef_ver1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                 uint32_t coef_hor0, uint32_t coef_hor1,
                                 uint32_t coef_ver0, uint32_t coef_ver1,
                                 int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
                              coef_ver1);
    }
}

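/* The *_and_aver_dst_* variants below repeat the filters above but average
 * the interpolated result with the bytes already in dst (__msa_aver_u_b),
 * i.e. the avg flavour of the chroma MC functions. */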
avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)843 static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
844                                                int32_t stride, uint32_t coeff0,
845                                                uint32_t coeff1)
846 {
847     uint16_t out0, out1;
848     v16i8 src0, src1;
849     v16u8 dst_data = { 0 };
850     v8u16 res_r;
851     v16u8 res;
852     v16i8 mask;
853     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
856 
857     mask = LD_SB(&chroma_mask_arr[0]);
858 
859     LD_SB2(src, stride, src0, src1);
860 
861     out0 = LH(dst);
862     out1 = LH(dst + stride);
863 
864     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
865     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
866 
867     src0 = __msa_vshf_b(mask, src1, src0);
868 
869     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
870     res_r <<= 3;
871     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
872     res_r = __msa_sat_u_h(res_r, 7);
873 
874     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
875     dst_data = __msa_aver_u_b(res, dst_data);
876 
877     out0 = __msa_copy_u_h((v8i16) dst_data, 0);
878     out1 = __msa_copy_u_h((v8i16) dst_data, 2);
879 
880     SH(out0, dst);
881     dst += stride;
882     SH(out1, dst);
883 }
884 
avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)885 static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
886                                                int32_t stride, uint32_t coeff0,
887                                                uint32_t coeff1)
888 {
889     uint16_t tp0, tp1, tp2, tp3;
890     v16u8 src0, src1, src2, src3;
891     v16u8 dst0, dst_data = { 0 };
892     v8u16 res_r;
893     v16i8 mask;
894     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
895     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
896     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
897 
898     mask = LD_SB(&chroma_mask_arr[64]);
899 
900     LD_UB4(src, stride, src0, src1, src2, src3);
901     tp0 = LH(dst);
902     tp1 = LH(dst + stride);
903     tp2 = LH(dst + 2 * stride);
904     tp3 = LH(dst + 3 * stride);
905     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
906     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
907     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
908     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
909 
910     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
911 
912     src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
913 
914     res_r = __msa_dotp_u_h(src0, coeff_vec);
915     res_r <<= 3;
916     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
917     res_r = __msa_sat_u_h(res_r, 7);
918 
919     dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
920     dst0 = __msa_aver_u_b(dst0, dst_data);
921 
922     ST_H4(dst0, 0, 1, 2, 3, dst, stride);
923 }
924 
avc_chroma_hz_and_aver_dst_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)925 static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
926                                               int32_t stride, uint32_t coeff0,
927                                               uint32_t coeff1, int32_t height)
928 {
929     if (2 == height) {
930         avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
931     } else if (4 == height) {
932         avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
933     }
934 }
935 
avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)936 static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
937                                                int32_t stride, uint32_t coeff0,
938                                                uint32_t coeff1)
939 {
940     uint32_t load0, load1;
941     v16i8 src0, src1;
942     v16u8 dst_data = { 0 };
943     v8u16 res_r;
944     v16i8 res, mask;
945     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
946     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
947     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
948 
949     mask = LD_SB(&chroma_mask_arr[0]);
950 
951     LD_SB2(src, stride, src0, src1);
952 
953     LW2(dst, stride, load0, load1);
954 
955     INSERT_W2_UB(load0, load1, dst_data);
956 
957     src0 = __msa_vshf_b(mask, src1, src0);
958 
959     res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
960     res_r <<= 3;
961     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
962     res_r = __msa_sat_u_h(res_r, 7);
963     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
964     dst_data = __msa_aver_u_b((v16u8) res, dst_data);
965 
966     ST_W2(dst_data, 0, 1, dst, stride);
967 }
968 
avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)969 static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
970                                                int32_t stride, uint32_t coeff0,
971                                                uint32_t coeff1)
972 {
973     uint32_t tp0, tp1, tp2, tp3;
974     v16u8 src0, src1, src2, src3;
975     v16u8 out, dst_data = { 0 };
976     v16i8 mask;
977     v8u16 res0_r, res1_r;
978     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
979     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
980     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
981 
982     mask = LD_SB(&chroma_mask_arr[0]);
983 
984     LD_UB4(src, stride, src0, src1, src2, src3);
985     LW4(dst, stride, tp0, tp1, tp2, tp3);
986     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
987     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
988     DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
989     res0_r <<= 3;
990     res1_r <<= 3;
991     SRARI_H2_UH(res0_r, res1_r, 6);
992     SAT_UH2_UH(res0_r, res1_r, 7);
993     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
994     out = __msa_aver_u_b(out, dst_data);
995     ST_W4(out, 0, 1, 2, 3, dst, stride);
996 }
997 
avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)998 static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
999                                                int32_t stride, uint32_t coeff0,
1000                                                uint32_t coeff1)
1001 {
1002     uint32_t tp0, tp1, tp2, tp3;
1003     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
1004     v16u8 dst0 = { 0 }, dst1 = { 0 };
1005     v16i8 mask;
1006     v8u16 res0, res1, res2, res3;
1007     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1008     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1009     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1010 
1011     mask = LD_SB(&chroma_mask_arr[0]);
1012 
1013     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1014     LW4(dst, stride, tp0, tp1, tp2, tp3);
1015     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1016     LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1017     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1018     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
1019     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
1020     DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
1021     DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
1022     SLLI_4V(res0, res1, res2, res3, 3);
1023     SRARI_H4_UH(res0, res1, res2, res3, 6);
1024     SAT_UH4_UH(res0, res1, res2, res3, 7);
1025     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1026     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1027     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1028 }
1029 
avc_chroma_hz_and_aver_dst_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1030 static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1031                                               int32_t stride, uint32_t coeff0,
1032                                               uint32_t coeff1, int32_t height)
1033 {
1034     if (2 == height) {
1035         avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
1036     } else if (4 == height) {
1037         avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
1038     } else if (8 == height) {
1039         avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
1040     }
1041 }
1042 
avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1043 static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1044                                                int32_t stride, uint32_t coeff0,
1045                                                uint32_t coeff1)
1046 {
1047     uint64_t tp0, tp1, tp2, tp3;
1048     v16u8 src0, src1, src2, src3, out0, out1;
1049     v16u8 dst0 = { 0 }, dst1 = { 0 };
1050     v8u16 res0, res1, res2, res3;
1051     v16i8 mask;
1052     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1053     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1054     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1055 
1056     mask = LD_SB(&chroma_mask_arr[32]);
1057     LD_UB4(src, stride, src0, src1, src2, src3);
1058     LD4(dst, stride, tp0, tp1, tp2, tp3);
1059     INSERT_D2_UB(tp0, tp1, dst0);
1060     INSERT_D2_UB(tp2, tp3, dst1);
1061     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1062     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1063     DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1064                 coeff_vec, res0, res1, res2, res3);
1065     SLLI_4V(res0, res1, res2, res3, 3);
1066     SRARI_H4_UH(res0, res1, res2, res3, 6);
1067     SAT_UH4_UH(res0, res1, res2, res3, 7);
1068     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1069     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1070     ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1071 }
1072 
avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1073 static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1074                                                int32_t stride, uint32_t coeff0,
1075                                                uint32_t coeff1)
1076 {
1077     uint64_t tp0, tp1, tp2, tp3;
1078     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1079     v16u8 out0, out1, out2, out3;
1080     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1081     v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1082     v16i8 mask;
1083     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1084     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1085     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1086 
1087     mask = LD_SB(&chroma_mask_arr[32]);
1088 
1089     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1090     LD4(dst, stride, tp0, tp1, tp2, tp3);
1091     INSERT_D2_UB(tp0, tp1, dst0);
1092     INSERT_D2_UB(tp2, tp3, dst1);
1093     LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1094     INSERT_D2_UB(tp0, tp1, dst2);
1095     INSERT_D2_UB(tp2, tp3, dst3);
1096     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1097     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1098     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
1099     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
1100     DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1101                 coeff_vec, res0, res1, res2, res3);
1102     DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1103                 coeff_vec, res4, res5, res6, res7);
1104     SLLI_4V(res0, res1, res2, res3, 3);
1105     SLLI_4V(res4, res5, res6, res7, 3);
1106     SRARI_H4_UH(res0, res1, res2, res3, 6);
1107     SRARI_H4_UH(res4, res5, res6, res7, 6);
1108     SAT_UH4_UH(res0, res1, res2, res3, 7);
1109     SAT_UH4_UH(res4, res5, res6, res7, 7);
1110     PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1111     PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1112     AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1113     AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1114     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1115 }
1116 
avc_chroma_hz_and_aver_dst_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1117 static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1118                                               int32_t stride, uint32_t coeff0,
1119                                               uint32_t coeff1, int32_t height)
1120 {
1121     if (4 == height) {
1122         avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
1123     } else if (8 == height) {
1124         avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
1125     }
1126 }
1127 
static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t out0, out1;
    v16i8 src0, src1, src2, tmp0, tmp1, res;
    v16u8 dst_data = { 0 };
    v8i16 out;
    v8u16 res_r;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_SB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);

    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);

    ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
    res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
    out0 = __msa_copy_u_h(out, 0);
    out1 = __msa_copy_u_h(out, 2);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 res_r;
    v8i16 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
    v16u8 dst_data = { 0 };

    LD_SB5(src, stride, src0, src1, src2, src3, src4);

    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
    dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               tmp0, tmp1, tmp2, tmp3);
    ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);

    res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);

    ST_H4(res, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
    }
}

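/* 4-pixel-wide vertical filter + average variants: the interleaved row
 * pairs are packed two output rows per vector, so each dot product covers
 * two rows; destination pixels are accessed as 32-bit words. */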
static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t load0, load1;
    v16u8 src0, src1, src2, tmp0, tmp1;
    v16u8 dst_data = { 0 };
    v8u16 res_r;
    v16u8 res;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB3(src, stride, src0, src1, src2);

    LW2(dst, stride, load0, load1);

    INSERT_W2_UB(load0, load1, dst_data);
    ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);

    tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);

    res_r = __msa_dotp_u_h(tmp0, coeff_vec);
    res_r <<= 3;
    res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
    res_r = __msa_sat_u_h(res_r, 7);
    res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
    res = __msa_aver_u_b(res, dst_data);

    ST_W2(res, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res0_r, res1_r;
    v16u8 out;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
    res0_r <<= 3;
    res1_r <<= 3;
    SRARI_H2_UH(res0_r, res1_r, 6);
    SAT_UH2_UH(res0_r, res1_r, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
    out = __msa_aver_u_b(out, dst0);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res0, res1, res2, res3;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
               tmp3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
               tmp7);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
    ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
    DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
    DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (2 == height) {
        avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
    } else if (4 == height) {
        avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

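/* 8-pixel-wide vertical filter + average variants: each interleaved row
 * pair fills a whole vector (one dot product per output row) and the
 * destination rows are accessed as 64-bit loads/stores. */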
static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out0, out1;
    v8u16 res0, res1, res2, res3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    SLLI_4V(res0, res1, res2, res3, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride, uint32_t coeff0,
                                               uint32_t coeff1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
               src0, src1, src2, src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
               src4, src5, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride, uint32_t coeff0,
                                              uint32_t coeff1, int32_t height)
{
    if (4 == height) {
        avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
    } else if (8 == height) {
        avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
    }
}

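/* avc_chroma_hv_and_aver_dst_* family: 2-D (x != 0 && y != 0) chroma
 * interpolation fused with averaging into the destination. Each row is
 * first filtered horizontally with the coef_hor0/coef_hor1 taps (shuffle +
 * dot product), the intermediate rows are then blended vertically with the
 * coef_ver0/coef_ver1 weights, rounded with srari by 6, saturated to
 * 8 bits and averaged with the existing destination pixels. */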
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t out0, out1;
    v16u8 dst0 = { 0 };
    v16u8 src0, src1, src2;
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB3(src, stride, src0, src1, src2);
    out0 = LH(dst);
    out1 = LH(dst + stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);
    out0 = __msa_copy_u_h((v8i16) dst0, 0);
    out1 = __msa_copy_u_h((v8i16) dst0, 1);

    SH(out0, dst);
    dst += stride;
    SH(out1, dst);
}

static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint16_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16u8 dst0 = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 res, mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[48]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    tp0 = LH(dst);
    tp1 = LH(dst + stride);
    tp2 = LH(dst + 2 * stride);
    tp3 = LH(dst + 3 * stride);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
    dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
    VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
    ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0);

    ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

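/* 4-pixel-wide 2-D filter + average variants: the shuffle packs two
 * adjacent rows per vector, so one horizontal dot product yields the
 * intermediate results for two rows at once. */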
static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1;
    v16u8 src0, src1, src2;
    v16u8 dst0, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB3(src, stride, src0, src1, src2);
    LW2(dst, stride, tp0, tp1);
    INSERT_W2_UB(tp0, tp1, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
    MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);

    res_vt0 += res_vt1;
    res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
    res_vt0 = __msa_sat_u_h(res_vt0, 7);
    dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
    dst0 = __msa_aver_u_b(dst0, dst_data);

    ST_W2(dst0, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[0]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (2 == height) {
        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (4 == height) {
        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

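/* 8-pixel-wide 2-D filter + average variants: a full row occupies one
 * vector, so the horizontal pass handles one row per shuffle/dot product
 * and the vertical pass blends consecutive horizontal results. */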
static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2;
    v8u16 res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    src0 = LD_UB(src);
    src += stride;
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    mask = LD_SB(&chroma_mask_arr[32]);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    if (4 == height) {
        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    } else if (8 == height) {
        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
    }
}

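/* Full-pel "put" cases (x == 0 && y == 0): no filtering required, rows are
 * simply copied using 32-bit (4-wide) or 64-bit (8-wide) scalar loads and
 * stores. */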
static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LW4(src, stride, tp4, tp5, tp6, tp7);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
        dst += 4 * stride;
        SW4(tp4, tp5, tp6, tp7, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        SW4(tp0, tp1, tp2, tp3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        SW(tp0, dst);
        dst += stride;
        SW(tp1, dst);
    }
}

static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    if (8 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        src += 4 * stride;
        LD4(src, stride, src4, src5, src6, src7);
        SD4(src0, src1, src2, src3, dst, stride);
        dst += 4 * stride;
        SD4(src4, src5, src6, src7, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, src0, src1, src2, src3);
        SD4(src0, src1, src2, src3, dst, stride);
    }
}

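/* Full-pel "avg" cases (x == 0 && y == 0): the source block is averaged
 * with the existing destination block via aver_u_b, which rounds up and so
 * matches (src + dst + 1) >> 1. */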
static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };

    if (8 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
    } else if (4 == height) {
        LW4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
        LW4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, stride);
    } else if (2 == height) {
        LW2(src, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, src0);
        LW2(dst, stride, tp0, tp1);
        INSERT_W2_UB(tp0, tp1, dst0);
        dst0 = __msa_aver_u_b(src0, dst0);
        ST_W2(dst0, 0, 1, dst, stride);
    }
}

static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    if (8 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        src += 4 * stride;
        LD4(src, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        INSERT_D2_UB(tp4, tp5, src2);
        INSERT_D2_UB(tp6, tp7, src3);
        LD4(dst, stride, tp0, tp1, tp2, tp3);
        LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        INSERT_D2_UB(tp4, tp5, dst2);
        INSERT_D2_UB(tp6, tp7, dst3);
        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                    dst2, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
    } else if (4 == height) {
        LD4(src, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, src0);
        INSERT_D2_UB(tp2, tp3, src1);
        LD4(dst, stride, tp0, tp1, tp2, tp3);
        INSERT_D2_UB(tp0, tp1, dst0);
        INSERT_D2_UB(tp2, tp3, dst1);
        AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
    }
}

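/* Exported "put" H.264 chroma MC for 8-, 4- and 2-pixel-wide blocks. x and
 * y are the fractional chroma MV components (0..7); the dispatch picks the
 * 2-D, horizontal-only, vertical-only or copy path and passes the filter
 * taps (x, 8 - x) and/or (y, 8 - y). */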
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width8_msa(src, dst, stride, height);
    }
}

void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        copy_width4_msa(src, dst, stride, height);
    }
}

void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
    } else if (x) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            *((uint16_t *) dst) = *((uint16_t *) src);

            src += stride;
            dst += stride;
        }
    }
}

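/* Exported "avg" variants: same dispatch as the "put" functions above, but
 * every path additionally averages the result with the pixels already in
 * dst (as used for bi-prediction). */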
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width8_msa(src, dst, stride, height);
    }
}

void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        avg_width4_msa(src, dst, stride, height);
    }
}

void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    int32_t cnt;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    if (x && y) {
        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), y,
                                          (8 - y), height);
    } else if (x) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, (8 - x), height);
    } else if (y) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, (8 - y), height);
    } else {
        for (cnt = height; cnt--;) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;

            src += stride;
            dst += stride;
        }
    }
}