1 /*
2 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264chroma_mips.h"
23
24 static const uint8_t chroma_mask_arr[16 * 5] = {
25 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
26 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
30 };
31
avc_chroma_hz_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)32 static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
33 uint32_t coeff0, uint32_t coeff1)
34 {
35 uint16_t out0, out1;
36 v16i8 src0, src1;
37 v8u16 res_r;
38 v8i16 res;
39 v16i8 mask;
40 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
41 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
42 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
43
44 mask = LD_SB(&chroma_mask_arr[0]);
45
46 LD_SB2(src, stride, src0, src1);
47
48 src0 = __msa_vshf_b(mask, src1, src0);
49 res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
50 res_r <<= 3;
51 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
52 res_r = __msa_sat_u_h(res_r, 7);
53 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
54
55 out0 = __msa_copy_u_h(res, 0);
56 out1 = __msa_copy_u_h(res, 2);
57
58 SH(out0, dst);
59 dst += stride;
60 SH(out1, dst);
61 }
62
avc_chroma_hz_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)63 static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
64 uint32_t coeff0, uint32_t coeff1)
65 {
66 v16u8 src0, src1, src2, src3;
67 v8u16 res_r;
68 v8i16 res;
69 v16i8 mask;
70 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
71 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
72 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
73
74 mask = LD_SB(&chroma_mask_arr[64]);
75
76 LD_UB4(src, stride, src0, src1, src2, src3);
77
78 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
79
80 src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
81
82 res_r = __msa_dotp_u_h(src0, coeff_vec);
83 res_r <<= 3;
84 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
85 res_r = __msa_sat_u_h(res_r, 7);
86 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
87
88 ST_H4(res, 0, 1, 2, 3, dst, stride);
89 }
90
avc_chroma_hz_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)91 static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
92 uint32_t coeff0, uint32_t coeff1,
93 int32_t height)
94 {
95 if (2 == height) {
96 avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
97 } else if (4 == height) {
98 avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
99 }
100 }
101
avc_chroma_hz_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)102 static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
103 uint32_t coeff0, uint32_t coeff1)
104 {
105 v16i8 src0, src1;
106 v8u16 res_r;
107 v4i32 res;
108 v16i8 mask;
109 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
110 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
111 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
112
113 mask = LD_SB(&chroma_mask_arr[0]);
114
115 LD_SB2(src, stride, src0, src1);
116
117 src0 = __msa_vshf_b(mask, src1, src0);
118 res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
119 res_r <<= 3;
120 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
121 res_r = __msa_sat_u_h(res_r, 7);
122 res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
123
124 ST_W2(res, 0, 1, dst, stride);
125 }
126
avc_chroma_hz_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)127 static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
128 uint32_t coeff0, uint32_t coeff1)
129 {
130 v16u8 src0, src1, src2, src3, out;
131 v8u16 res0_r, res1_r;
132 v16i8 mask;
133 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
134 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
135 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
136
137 mask = LD_SB(&chroma_mask_arr[0]);
138
139 LD_UB4(src, stride, src0, src1, src2, src3);
140 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
141 DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
142 res0_r <<= 3;
143 res1_r <<= 3;
144 SRARI_H2_UH(res0_r, res1_r, 6);
145 SAT_UH2_UH(res0_r, res1_r, 7);
146 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
147 ST_W4(out, 0, 1, 2, 3, dst, stride);
148 }
149
avc_chroma_hz_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)150 static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
151 uint32_t coeff0, uint32_t coeff1)
152 {
153 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
154 v16i8 mask;
155 v8u16 res0, res1, res2, res3;
156 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
157 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
158 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
159
160 mask = LD_SB(&chroma_mask_arr[0]);
161
162 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
163 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
164 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
165 DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
166 DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
167 SLLI_4V(res0, res1, res2, res3, 3);
168 SRARI_H4_UH(res0, res1, res2, res3, 6);
169 SAT_UH4_UH(res0, res1, res2, res3, 7);
170 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
171 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
172 }
173
avc_chroma_hz_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)174 static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
175 uint32_t coeff0, uint32_t coeff1,
176 int32_t height)
177 {
178 if (2 == height) {
179 avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
180 } else if (4 == height) {
181 avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
182 } else if (8 == height) {
183 avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
184 }
185 }
186
avc_chroma_hz_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)187 static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
188 uint32_t coeff0, uint32_t coeff1)
189 {
190 v16u8 src0, src1, src2, src3, out0, out1;
191 v8u16 res0, res1, res2, res3;
192 v16i8 mask;
193 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
194 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
195 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
196
197 mask = LD_SB(&chroma_mask_arr[32]);
198 LD_UB4(src, stride, src0, src1, src2, src3);
199 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
200 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
201 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
202 coeff_vec, res0, res1, res2, res3);
203 SLLI_4V(res0, res1, res2, res3, 3);
204 SRARI_H4_UH(res0, res1, res2, res3, 6);
205 SAT_UH4_UH(res0, res1, res2, res3, 7);
206 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
207 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
208 }
209
avc_chroma_hz_8x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)210 static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
211 uint32_t coeff0, uint32_t coeff1)
212 {
213 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
214 v16u8 out0, out1, out2, out3;
215 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
216 v16i8 mask;
217 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
218 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
219 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
220
221 mask = LD_SB(&chroma_mask_arr[32]);
222
223 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
224 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
225 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
226 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
227 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
228 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
229 coeff_vec, res0, res1, res2, res3);
230 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
231 coeff_vec, res4, res5, res6, res7);
232 SLLI_4V(res0, res1, res2, res3, 3);
233 SLLI_4V(res4, res5, res6, res7, 3);
234 SRARI_H4_UH(res0, res1, res2, res3, 6);
235 SRARI_H4_UH(res4, res5, res6, res7, 6);
236 SAT_UH4_UH(res0, res1, res2, res3, 7);
237 SAT_UH4_UH(res4, res5, res6, res7, 7);
238 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
239 PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
240 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
241 }
242
avc_chroma_hz_nonmult_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)243 static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
244 int32_t stride, uint32_t coeff0,
245 uint32_t coeff1, int32_t height)
246 {
247 uint32_t row;
248 v16u8 src0, src1, src2, src3, out0, out1;
249 v8u16 res0, res1, res2, res3;
250 v16i8 mask;
251 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
252 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
253 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
254
255 mask = LD_SB(&chroma_mask_arr[32]);
256
257 for (row = height >> 2; row--;) {
258 LD_UB4(src, stride, src0, src1, src2, src3);
259 src += (4 * stride);
260
261 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
262 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
263 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
264 coeff_vec, res0, res1, res2, res3);
265 SLLI_4V(res0, res1, res2, res3, 3);
266 SRARI_H4_UH(res0, res1, res2, res3, 6);
267 SAT_UH4_UH(res0, res1, res2, res3, 7);
268 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
269 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
270 dst += (4 * stride);
271 }
272
273 if (0 != (height % 4)) {
274 for (row = (height % 4); row--;) {
275 src0 = LD_UB(src);
276 src += stride;
277
278 src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
279
280 res0 = __msa_dotp_u_h(src0, coeff_vec);
281 res0 <<= 3;
282 res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
283 res0 = __msa_sat_u_h(res0, 7);
284 res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
285
286 ST_D1(res0, 0, dst);
287 dst += stride;
288 }
289 }
290 }
291
avc_chroma_hz_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)292 static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
293 uint32_t coeff0, uint32_t coeff1,
294 int32_t height)
295 {
296 if (4 == height) {
297 avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
298 } else if (8 == height) {
299 avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
300 } else {
301 avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
302 }
303 }
304
avc_chroma_vt_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)305 static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
306 uint32_t coeff0, uint32_t coeff1)
307 {
308 uint16_t out0, out1;
309 v16i8 src0, src1, src2;
310 v16u8 tmp0, tmp1;
311 v8i16 res;
312 v8u16 res_r;
313 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
314 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
315 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
316
317 LD_SB3(src, stride, src0, src1, src2);
318
319 ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
320
321 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
322
323 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
324 res_r <<= 3;
325 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
326 res_r = __msa_sat_u_h(res_r, 7);
327 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
328
329 out0 = __msa_copy_u_h(res, 0);
330 out1 = __msa_copy_u_h(res, 2);
331
332 SH(out0, dst);
333 dst += stride;
334 SH(out1, dst);
335 }
336
avc_chroma_vt_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)337 static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
338 uint32_t coeff0, uint32_t coeff1)
339 {
340 v16u8 src0, src1, src2, src3, src4;
341 v16u8 tmp0, tmp1, tmp2, tmp3;
342 v8i16 res;
343 v8u16 res_r;
344 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
345 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
346 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
347
348 LD_UB5(src, stride, src0, src1, src2, src3, src4);
349 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
350 tmp0, tmp1, tmp2, tmp3);
351 ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
352
353 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
354
355 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
356 res_r <<= 3;
357 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
358 res_r = __msa_sat_u_h(res_r, 7);
359
360 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
361
362 ST_H4(res, 0, 1, 2, 3, dst, stride);
363 }
364
avc_chroma_vt_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)365 static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
366 uint32_t coeff0, uint32_t coeff1,
367 int32_t height)
368 {
369 if (2 == height) {
370 avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
371 } else if (4 == height) {
372 avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
373 }
374 }
375
avc_chroma_vt_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)376 static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
377 uint32_t coeff0, uint32_t coeff1)
378 {
379 v16u8 src0, src1, src2;
380 v16u8 tmp0, tmp1;
381 v4i32 res;
382 v8u16 res_r;
383 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
384 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
385 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
386
387 LD_UB3(src, stride, src0, src1, src2);
388 ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
389
390 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
391 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
392 res_r <<= 3;
393 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
394 res_r = __msa_sat_u_h(res_r, 7);
395 res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
396
397 ST_W2(res, 0, 1, dst, stride);
398 }
399
avc_chroma_vt_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)400 static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
401 uint32_t coeff0, uint32_t coeff1)
402 {
403 v16u8 src0, src1, src2, src3, src4;
404 v16u8 tmp0, tmp1, tmp2, tmp3;
405 v16u8 out;
406 v8u16 res0_r, res1_r;
407 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
408 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
409 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
410
411 LD_UB5(src, stride, src0, src1, src2, src3, src4);
412 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
413 tmp3);
414 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
415 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
416 res0_r <<= 3;
417 res1_r <<= 3;
418 SRARI_H2_UH(res0_r, res1_r, 6);
419 SAT_UH2_UH(res0_r, res1_r, 7);
420 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
421 ST_W4(out, 0, 1, 2, 3, dst, stride);
422 }
423
avc_chroma_vt_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)424 static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
425 uint32_t coeff0, uint32_t coeff1)
426 {
427 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
428 v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
429 v8u16 res0, res1, res2, res3;
430 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
431 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
432 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
433
434 LD_UB5(src, stride, src0, src1, src2, src3, src4);
435 src += (5 * stride);
436 LD_UB4(src, stride, src5, src6, src7, src8);
437 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
438 tmp3);
439 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
440 tmp7);
441 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
442 ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
443 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
444 DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
445 SLLI_4V(res0, res1, res2, res3, 3);
446 SRARI_H4_UH(res0, res1, res2, res3, 6);
447 SAT_UH4_UH(res0, res1, res2, res3, 7);
448 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
449 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
450 }
451
avc_chroma_vt_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)452 static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
453 uint32_t coeff0, uint32_t coeff1,
454 int32_t height)
455 {
456 if (2 == height) {
457 avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
458 } else if (4 == height) {
459 avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
460 } else if (8 == height) {
461 avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
462 }
463 }
464
avc_chroma_vt_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)465 static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
466 uint32_t coeff0, uint32_t coeff1)
467 {
468 v16u8 src0, src1, src2, src3, src4, out0, out1;
469 v8u16 res0, res1, res2, res3;
470 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
471 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
472 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
473
474 LD_UB5(src, stride, src0, src1, src2, src3, src4);
475 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
476 src3);
477 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
478 coeff_vec, res0, res1, res2, res3);
479 SLLI_4V(res0, res1, res2, res3, 3);
480 SRARI_H4_UH(res0, res1, res2, res3, 6);
481 SAT_UH4_UH(res0, res1, res2, res3, 7);
482 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
483 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
484 }
485
/* Vertical 2-tap chroma filter, 8x8 block (weights coeff0/coeff1 assumed to
 * sum to 8; rounded result (sum + 4) >> 3, clamped to 8 bits). Needs 9 input
 * rows for 8 output rows.
 */
static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                                  uint32_t coeff0, uint32_t coeff1)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 coeff_vec0 = __msa_fill_b(coeff0);
    v16i8 coeff_vec1 = __msa_fill_b(coeff1);
    v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);

    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    /* interleave each pair of adjacent rows for the 2-tap dot product */
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
               src3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
               src7);
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res0, res1, res2, res3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
                coeff_vec, res4, res5, res6, res7);
    /* scale by 8, then rounding shift by 6 == (sum + 4) >> 3 */
    SLLI_4V(res0, res1, res2, res3, 3);
    SLLI_4V(res4, res5, res6, res7, 3);
    SRARI_H4_UH(res0, res1, res2, res3, 6);
    SRARI_H4_UH(res4, res5, res6, res7, 6);
    SAT_UH4_UH(res0, res1, res2, res3, 7);
    /* fix: the second saturation was a copy-paste duplicate over res0..res3,
     * leaving res4..res7 unsaturated before packing rows 4-7 */
    SAT_UH4_UH(res4, res5, res6, res7, 7);
    PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
    PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
517
avc_chroma_vt_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)518 static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
519 uint32_t coeff0, uint32_t coeff1,
520 int32_t height)
521 {
522 if (4 == height) {
523 avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
524 } else if (8 == height) {
525 avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
526 }
527 }
528
avc_chroma_hv_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)529 static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
530 uint32_t coef_hor0, uint32_t coef_hor1,
531 uint32_t coef_ver0, uint32_t coef_ver1)
532 {
533 uint16_t out0, out1;
534 v16u8 src0, src1, src2;
535 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
536 v8i16 res_vert;
537 v16i8 mask;
538 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
539 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
540 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
541 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
542 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
543
544 mask = LD_SB(&chroma_mask_arr[48]);
545
546 LD_UB3(src, stride, src0, src1, src2);
547 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
548 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
549 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
550
551 res_vt0 += res_vt1;
552 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
553 res_vt0 = __msa_sat_u_h(res_vt0, 7);
554 res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
555
556 out0 = __msa_copy_u_h(res_vert, 0);
557 out1 = __msa_copy_u_h(res_vert, 1);
558
559 SH(out0, dst);
560 dst += stride;
561 SH(out1, dst);
562 }
563
avc_chroma_hv_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)564 static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
565 uint32_t coef_hor0, uint32_t coef_hor1,
566 uint32_t coef_ver0, uint32_t coef_ver1)
567 {
568 v16u8 src0, src1, src2, src3, src4;
569 v16u8 tmp0, tmp1, tmp2, tmp3;
570 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
571 v8i16 res;
572 v16i8 mask;
573 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
574 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
575 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
576 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
577 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
578
579 mask = LD_SB(&chroma_mask_arr[48]);
580
581 LD_UB5(src, stride, src0, src1, src2, src3, src4);
582
583 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
584 VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
585 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
586 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
587 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
588
589 res_vt0 += res_vt1;
590 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
591 res_vt0 = __msa_sat_u_h(res_vt0, 7);
592
593 res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
594
595 ST_H4(res, 0, 1, 2, 3, dst, stride);
596 }
597
avc_chroma_hv_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1,int32_t height)598 static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
599 uint32_t coef_hor0, uint32_t coef_hor1,
600 uint32_t coef_ver0, uint32_t coef_ver1,
601 int32_t height)
602 {
603 if (2 == height) {
604 avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
605 coef_ver1);
606 } else if (4 == height) {
607 avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
608 coef_ver1);
609 }
610 }
611
avc_chroma_hv_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)612 static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
613 uint32_t coef_hor0, uint32_t coef_hor1,
614 uint32_t coef_ver0, uint32_t coef_ver1)
615 {
616 v16u8 src0, src1, src2;
617 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
618 v16i8 mask;
619 v4i32 res;
620 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
621 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
622 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
623 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
624 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
625
626 mask = LD_SB(&chroma_mask_arr[0]);
627 LD_UB3(src, stride, src0, src1, src2);
628 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
629 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
630 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
631
632 res_vt0 += res_vt1;
633 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
634 res_vt0 = __msa_sat_u_h(res_vt0, 7);
635 res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
636
637 ST_W2(res, 0, 1, dst, stride);
638 }
639
avc_chroma_hv_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)640 static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
641 uint32_t coef_hor0, uint32_t coef_hor1,
642 uint32_t coef_ver0, uint32_t coef_ver1)
643 {
644 v16u8 src0, src1, src2, src3, src4;
645 v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647 v16i8 mask;
648 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
649 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
650 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
651 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
652 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
653 v4i32 res0, res1;
654
655 mask = LD_SB(&chroma_mask_arr[0]);
656
657 LD_UB5(src, stride, src0, src1, src2, src3, src4);
658 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
659 VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
660 DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
661 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
662 res_hz3);
663 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
664 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
665 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
666 SRARI_H2_UH(res_vt0, res_vt1, 6);
667 SAT_UH2_UH(res_vt0, res_vt1, 7);
668 PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
669 ST_W2(res0, 0, 1, dst, stride);
670 ST_W2(res1, 0, 1, dst + 2 * stride, stride);
671 }
672
avc_chroma_hv_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)673 static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
674 uint32_t coef_hor0, uint32_t coef_hor1,
675 uint32_t coef_ver0, uint32_t coef_ver1)
676 {
677 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
678 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
679 v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
680 v16i8 mask;
681 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
682 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
683 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
684 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
685 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
686
687 mask = LD_SB(&chroma_mask_arr[0]);
688
689 LD_UB5(src, stride, src0, src1, src2, src3, src4);
690 src += (5 * stride);
691 LD_UB4(src, stride, src5, src6, src7, src8);
692
693 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
694 VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
695 VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
696 VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
697 DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
698 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
699 DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
700 coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
701 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
702 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
703 MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
704 res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
705 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
706 ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
707 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
708 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
709 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
710 ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
711 }
712
avc_chroma_hv_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1,int32_t height)713 static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
714 uint32_t coef_hor0, uint32_t coef_hor1,
715 uint32_t coef_ver0, uint32_t coef_ver1,
716 int32_t height)
717 {
718 if (2 == height) {
719 avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
720 coef_ver1);
721 } else if (4 == height) {
722 avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
723 coef_ver1);
724 } else if (8 == height) {
725 avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
726 coef_ver1);
727 }
728 }
729
avc_chroma_hv_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)730 static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
731 uint32_t coef_hor0, uint32_t coef_hor1,
732 uint32_t coef_ver0, uint32_t coef_ver1)
733 {
734 v16u8 src0, src1, src2, src3, src4, out0, out1;
735 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
736 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
737 v16i8 mask;
738 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
739 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
740 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
741 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
742 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
743
744 mask = LD_SB(&chroma_mask_arr[32]);
745
746 src0 = LD_UB(src);
747 src += stride;
748
749 src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
750 res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
751
752 LD_UB4(src, stride, src1, src2, src3, src4);
753 src += (4 * stride);
754
755 VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
756 VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
757 DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
758 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
759 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
760 res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
761
762 res_vt0 += (res_hz0 * coeff_vt_vec1);
763 res_vt1 += (res_hz1 * coeff_vt_vec1);
764 res_vt2 += (res_hz2 * coeff_vt_vec1);
765 res_vt3 += (res_hz3 * coeff_vt_vec1);
766
767 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
768 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
769 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
770 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
771 }
772
avc_chroma_hv_8x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)773 static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
774 uint32_t coef_hor0, uint32_t coef_hor1,
775 uint32_t coef_ver0, uint32_t coef_ver1)
776 {
777 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
778 v16u8 out0, out1, out2, out3;
779 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
780 v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
781 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
782 v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
783 v16i8 mask;
784 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
785 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
786 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
787 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
788 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
789
790 mask = LD_SB(&chroma_mask_arr[32]);
791
792 LD_UB5(src, stride, src0, src1, src2, src3, src4);
793 src += (5 * stride);
794 LD_UB4(src, stride, src5, src6, src7, src8);
795 src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
796 VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
797 VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
798 VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
799 VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
800 res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
801 DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
802 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
803 res_hz4);
804 DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
805 coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
806 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
807 coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
808 res_vt3);
809 MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
810 coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
811 res_vt7);
812 res_vt0 += (res_hz0 * coeff_vt_vec1);
813 res_vt1 += (res_hz1 * coeff_vt_vec1);
814 res_vt2 += (res_hz2 * coeff_vt_vec1);
815 res_vt3 += (res_hz3 * coeff_vt_vec1);
816 res_vt4 += (res_hz4 * coeff_vt_vec1);
817 res_vt5 += (res_hz5 * coeff_vt_vec1);
818 res_vt6 += (res_hz6 * coeff_vt_vec1);
819 res_vt7 += (res_hz7 * coeff_vt_vec1);
820 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
821 SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
822 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
823 SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
824 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
825 PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
826 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
827 }
828
avc_chroma_hv_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1,int32_t height)829 static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
830 uint32_t coef_hor0, uint32_t coef_hor1,
831 uint32_t coef_ver0, uint32_t coef_ver1,
832 int32_t height)
833 {
834 if (4 == height) {
835 avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
836 coef_ver1);
837 } else if (8 == height) {
838 avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
839 coef_ver1);
840 }
841 }
842
avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)843 static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
844 int32_t stride, uint32_t coeff0,
845 uint32_t coeff1)
846 {
847 uint16_t out0, out1;
848 v16i8 src0, src1;
849 v16u8 dst_data = { 0 };
850 v8u16 res_r;
851 v16u8 res;
852 v16i8 mask;
853 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
856
857 mask = LD_SB(&chroma_mask_arr[0]);
858
859 LD_SB2(src, stride, src0, src1);
860
861 out0 = LH(dst);
862 out1 = LH(dst + stride);
863
864 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
865 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
866
867 src0 = __msa_vshf_b(mask, src1, src0);
868
869 res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
870 res_r <<= 3;
871 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
872 res_r = __msa_sat_u_h(res_r, 7);
873
874 res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
875 dst_data = __msa_aver_u_b(res, dst_data);
876
877 out0 = __msa_copy_u_h((v8i16) dst_data, 0);
878 out1 = __msa_copy_u_h((v8i16) dst_data, 2);
879
880 SH(out0, dst);
881 dst += stride;
882 SH(out1, dst);
883 }
884
avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)885 static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
886 int32_t stride, uint32_t coeff0,
887 uint32_t coeff1)
888 {
889 uint16_t tp0, tp1, tp2, tp3;
890 v16u8 src0, src1, src2, src3;
891 v16u8 dst0, dst_data = { 0 };
892 v8u16 res_r;
893 v16i8 mask;
894 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
895 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
896 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
897
898 mask = LD_SB(&chroma_mask_arr[64]);
899
900 LD_UB4(src, stride, src0, src1, src2, src3);
901 tp0 = LH(dst);
902 tp1 = LH(dst + stride);
903 tp2 = LH(dst + 2 * stride);
904 tp3 = LH(dst + 3 * stride);
905 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
906 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
907 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
908 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
909
910 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
911
912 src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
913
914 res_r = __msa_dotp_u_h(src0, coeff_vec);
915 res_r <<= 3;
916 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
917 res_r = __msa_sat_u_h(res_r, 7);
918
919 dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
920 dst0 = __msa_aver_u_b(dst0, dst_data);
921
922 ST_H4(dst0, 0, 1, 2, 3, dst, stride);
923 }
924
avc_chroma_hz_and_aver_dst_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)925 static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
926 int32_t stride, uint32_t coeff0,
927 uint32_t coeff1, int32_t height)
928 {
929 if (2 == height) {
930 avc_chroma_hz_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
931 } else if (4 == height) {
932 avc_chroma_hz_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
933 }
934 }
935
avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)936 static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
937 int32_t stride, uint32_t coeff0,
938 uint32_t coeff1)
939 {
940 uint32_t load0, load1;
941 v16i8 src0, src1;
942 v16u8 dst_data = { 0 };
943 v8u16 res_r;
944 v16i8 res, mask;
945 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
946 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
947 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
948
949 mask = LD_SB(&chroma_mask_arr[0]);
950
951 LD_SB2(src, stride, src0, src1);
952
953 LW2(dst, stride, load0, load1);
954
955 INSERT_W2_UB(load0, load1, dst_data);
956
957 src0 = __msa_vshf_b(mask, src1, src0);
958
959 res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
960 res_r <<= 3;
961 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
962 res_r = __msa_sat_u_h(res_r, 7);
963 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
964 dst_data = __msa_aver_u_b((v16u8) res, dst_data);
965
966 ST_W2(dst_data, 0, 1, dst, stride);
967 }
968
avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)969 static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
970 int32_t stride, uint32_t coeff0,
971 uint32_t coeff1)
972 {
973 uint32_t tp0, tp1, tp2, tp3;
974 v16u8 src0, src1, src2, src3;
975 v16u8 out, dst_data = { 0 };
976 v16i8 mask;
977 v8u16 res0_r, res1_r;
978 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
979 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
980 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
981
982 mask = LD_SB(&chroma_mask_arr[0]);
983
984 LD_UB4(src, stride, src0, src1, src2, src3);
985 LW4(dst, stride, tp0, tp1, tp2, tp3);
986 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
987 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
988 DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
989 res0_r <<= 3;
990 res1_r <<= 3;
991 SRARI_H2_UH(res0_r, res1_r, 6);
992 SAT_UH2_UH(res0_r, res1_r, 7);
993 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
994 out = __msa_aver_u_b(out, dst_data);
995 ST_W4(out, 0, 1, 2, 3, dst, stride);
996 }
997
avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)998 static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
999 int32_t stride, uint32_t coeff0,
1000 uint32_t coeff1)
1001 {
1002 uint32_t tp0, tp1, tp2, tp3;
1003 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
1004 v16u8 dst0 = { 0 }, dst1 = { 0 };
1005 v16i8 mask;
1006 v8u16 res0, res1, res2, res3;
1007 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1008 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1009 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1010
1011 mask = LD_SB(&chroma_mask_arr[0]);
1012
1013 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1014 LW4(dst, stride, tp0, tp1, tp2, tp3);
1015 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1016 LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1017 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1018 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
1019 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
1020 DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
1021 DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
1022 SLLI_4V(res0, res1, res2, res3, 3);
1023 SRARI_H4_UH(res0, res1, res2, res3, 6);
1024 SAT_UH4_UH(res0, res1, res2, res3, 7);
1025 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1026 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1027 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1028 }
1029
avc_chroma_hz_and_aver_dst_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1030 static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1031 int32_t stride, uint32_t coeff0,
1032 uint32_t coeff1, int32_t height)
1033 {
1034 if (2 == height) {
1035 avc_chroma_hz_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
1036 } else if (4 == height) {
1037 avc_chroma_hz_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
1038 } else if (8 == height) {
1039 avc_chroma_hz_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
1040 }
1041 }
1042
avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1043 static void avc_chroma_hz_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1044 int32_t stride, uint32_t coeff0,
1045 uint32_t coeff1)
1046 {
1047 uint64_t tp0, tp1, tp2, tp3;
1048 v16u8 src0, src1, src2, src3, out0, out1;
1049 v16u8 dst0 = { 0 }, dst1 = { 0 };
1050 v8u16 res0, res1, res2, res3;
1051 v16i8 mask;
1052 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1053 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1054 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1055
1056 mask = LD_SB(&chroma_mask_arr[32]);
1057 LD_UB4(src, stride, src0, src1, src2, src3);
1058 LD4(dst, stride, tp0, tp1, tp2, tp3);
1059 INSERT_D2_UB(tp0, tp1, dst0);
1060 INSERT_D2_UB(tp2, tp3, dst1);
1061 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1062 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1063 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1064 coeff_vec, res0, res1, res2, res3);
1065 SLLI_4V(res0, res1, res2, res3, 3);
1066 SRARI_H4_UH(res0, res1, res2, res3, 6);
1067 SAT_UH4_UH(res0, res1, res2, res3, 7);
1068 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1069 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1070 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
1071 }
1072
avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1073 static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1074 int32_t stride, uint32_t coeff0,
1075 uint32_t coeff1)
1076 {
1077 uint64_t tp0, tp1, tp2, tp3;
1078 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1079 v16u8 out0, out1, out2, out3;
1080 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1081 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1082 v16i8 mask;
1083 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1084 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1085 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1086
1087 mask = LD_SB(&chroma_mask_arr[32]);
1088
1089 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1090 LD4(dst, stride, tp0, tp1, tp2, tp3);
1091 INSERT_D2_UB(tp0, tp1, dst0);
1092 INSERT_D2_UB(tp2, tp3, dst1);
1093 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1094 INSERT_D2_UB(tp0, tp1, dst2);
1095 INSERT_D2_UB(tp2, tp3, dst3);
1096 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1097 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1098 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
1099 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
1100 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1101 coeff_vec, res0, res1, res2, res3);
1102 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1103 coeff_vec, res4, res5, res6, res7);
1104 SLLI_4V(res0, res1, res2, res3, 3);
1105 SLLI_4V(res4, res5, res6, res7, 3);
1106 SRARI_H4_UH(res0, res1, res2, res3, 6);
1107 SRARI_H4_UH(res4, res5, res6, res7, 6);
1108 SAT_UH4_UH(res0, res1, res2, res3, 7);
1109 SAT_UH4_UH(res4, res5, res6, res7, 7);
1110 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1111 PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1112 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1113 AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1114 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1115 }
1116
avc_chroma_hz_and_aver_dst_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1117 static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1118 int32_t stride, uint32_t coeff0,
1119 uint32_t coeff1, int32_t height)
1120 {
1121 if (4 == height) {
1122 avc_chroma_hz_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
1123 } else if (8 == height) {
1124 avc_chroma_hz_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
1125 }
1126 }
1127
avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1128 static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1129 int32_t stride, uint32_t coeff0,
1130 uint32_t coeff1)
1131 {
1132 uint16_t out0, out1;
1133 v16i8 src0, src1, src2, tmp0, tmp1, res;
1134 v16u8 dst_data = { 0 };
1135 v8i16 out;
1136 v8u16 res_r;
1137 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1138 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1139 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1140
1141 LD_SB3(src, stride, src0, src1, src2);
1142 out0 = LH(dst);
1143 out1 = LH(dst + stride);
1144
1145 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
1146 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
1147
1148 ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
1149
1150 tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1151 res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1152 res_r <<= 3;
1153 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1154 res_r = __msa_sat_u_h(res_r, 7);
1155 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1156 out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1157 out0 = __msa_copy_u_h(out, 0);
1158 out1 = __msa_copy_u_h(out, 2);
1159
1160 SH(out0, dst);
1161 dst += stride;
1162 SH(out1, dst);
1163 }
1164
avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1165 static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1166 int32_t stride, uint32_t coeff0,
1167 uint32_t coeff1)
1168 {
1169 uint16_t tp0, tp1, tp2, tp3;
1170 v16i8 src0, src1, src2, src3, src4;
1171 v16u8 tmp0, tmp1, tmp2, tmp3;
1172 v8u16 res_r;
1173 v8i16 res;
1174 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1175 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1176 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1177 v16u8 dst_data = { 0 };
1178
1179 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1180
1181 tp0 = LH(dst);
1182 tp1 = LH(dst + stride);
1183 tp2 = LH(dst + 2 * stride);
1184 tp3 = LH(dst + 3 * stride);
1185 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
1186 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
1187 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
1188 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
1189
1190 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1191 tmp0, tmp1, tmp2, tmp3);
1192 ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1193
1194 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1195
1196 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1197 res_r <<= 3;
1198 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1199 res_r = __msa_sat_u_h(res_r, 7);
1200
1201 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1202 res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1203
1204 ST_H4(res, 0, 1, 2, 3, dst, stride);
1205 }
1206
avc_chroma_vt_and_aver_dst_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1207 static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
1208 int32_t stride, uint32_t coeff0,
1209 uint32_t coeff1, int32_t height)
1210 {
1211 if (2 == height) {
1212 avc_chroma_vt_and_aver_dst_2x2_msa(src, dst, stride, coeff0, coeff1);
1213 } else if (4 == height) {
1214 avc_chroma_vt_and_aver_dst_2x4_msa(src, dst, stride, coeff0, coeff1);
1215 }
1216 }
1217
avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1218 static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1219 int32_t stride, uint32_t coeff0,
1220 uint32_t coeff1)
1221 {
1222 uint32_t load0, load1;
1223 v16u8 src0, src1, src2, tmp0, tmp1;
1224 v16u8 dst_data = { 0 };
1225 v8u16 res_r;
1226 v16u8 res;
1227 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1228 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1229 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1230
1231 LD_UB3(src, stride, src0, src1, src2);
1232
1233 LW2(dst, stride, load0, load1);
1234
1235 INSERT_W2_UB(load0, load1, dst_data);
1236 ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
1237
1238 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1239
1240 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1241 res_r <<= 3;
1242 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1243 res_r = __msa_sat_u_h(res_r, 7);
1244 res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1245 res = __msa_aver_u_b(res, dst_data);
1246
1247 ST_W2(res, 0, 1, dst, stride);
1248 }
1249
avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1250 static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
1251 int32_t stride, uint32_t coeff0,
1252 uint32_t coeff1)
1253 {
1254 uint32_t tp0, tp1, tp2, tp3;
1255 v16u8 src0, src1, src2, src3, src4;
1256 v16u8 tmp0, tmp1, tmp2, tmp3;
1257 v16u8 dst0 = { 0 };
1258 v8u16 res0_r, res1_r;
1259 v16u8 out;
1260 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1261 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1262 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1263
1264 LD_UB5(src, stride, src0, src1, src2, src3, src4);
1265 LW4(dst, stride, tp0, tp1, tp2, tp3);
1266 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1267 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1268 tmp3);
1269 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1270 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
1271 res0_r <<= 3;
1272 res1_r <<= 3;
1273 SRARI_H2_UH(res0_r, res1_r, 6);
1274 SAT_UH2_UH(res0_r, res1_r, 7);
1275 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
1276 out = __msa_aver_u_b(out, dst0);
1277 ST_W4(out, 0, 1, 2, 3, dst, stride);
1278 }
1279
avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1280 static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
1281 int32_t stride, uint32_t coeff0,
1282 uint32_t coeff1)
1283 {
1284 uint32_t tp0, tp1, tp2, tp3;
1285 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1286 v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
1287 v16u8 dst0 = { 0 }, dst1 = { 0 };
1288 v8u16 res0, res1, res2, res3;
1289 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1290 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1291 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1292
1293 LD_UB5(src, stride, src0, src1, src2, src3, src4);
1294 src += (5 * stride);
1295 LD_UB4(src, stride, src5, src6, src7, src8);
1296 LW4(dst, stride, tp0, tp1, tp2, tp3);
1297 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1298 LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1299 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
1300 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
1301 tmp3);
1302 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
1303 tmp7);
1304 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1305 ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
1306 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
1307 DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
1308 SLLI_4V(res0, res1, res2, res3, 3);
1309 SRARI_H4_UH(res0, res1, res2, res3, 6);
1310 SAT_UH4_UH(res0, res1, res2, res3, 7);
1311 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1312 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1313 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
1314 }
1315
avc_chroma_vt_and_aver_dst_4w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1316 static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
1317 int32_t stride, uint32_t coeff0,
1318 uint32_t coeff1, int32_t height)
1319 {
1320 if (2 == height) {
1321 avc_chroma_vt_and_aver_dst_4x2_msa(src, dst, stride, coeff0, coeff1);
1322 } else if (4 == height) {
1323 avc_chroma_vt_and_aver_dst_4x4_msa(src, dst, stride, coeff0, coeff1);
1324 } else if (8 == height) {
1325 avc_chroma_vt_and_aver_dst_4x8_msa(src, dst, stride, coeff0, coeff1);
1326 }
1327 }
1328
avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1329 static void avc_chroma_vt_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
1330 int32_t stride, uint32_t coeff0,
1331 uint32_t coeff1)
1332 {
1333 uint64_t tp0, tp1, tp2, tp3;
1334 v16u8 src0, src1, src2, src3, src4;
1335 v16u8 out0, out1;
1336 v8u16 res0, res1, res2, res3;
1337 v16u8 dst0 = { 0 }, dst1 = { 0 };
1338 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1339 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1340 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1341
1342 LD_UB5(src, stride, src0, src1, src2, src3, src4);
1343 LD4(dst, stride, tp0, tp1, tp2, tp3);
1344 INSERT_D2_UB(tp0, tp1, dst0);
1345 INSERT_D2_UB(tp2, tp3, dst1);
1346 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1347 src0, src1, src2, src3);
1348 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1349 coeff_vec, res0, res1, res2, res3);
1350 SLLI_4V(res0, res1, res2, res3, 3);
1351 SRARI_H4_UH(res0, res1, res2, res3, 6);
1352 SAT_UH4_UH(res0, res1, res2, res3, 7);
1353 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1354 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1355 ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
1356 }
1357
avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1)1358 static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
1359 int32_t stride, uint32_t coeff0,
1360 uint32_t coeff1)
1361 {
1362 uint64_t tp0, tp1, tp2, tp3;
1363 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1364 v16u8 out0, out1, out2, out3;
1365 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1366 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1367 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1368 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1369 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1370
1371 LD_UB5(src, stride, src0, src1, src2, src3, src4);
1372 src += (5 * stride);
1373 LD_UB4(src, stride, src5, src6, src7, src8);
1374 LD4(dst, stride, tp0, tp1, tp2, tp3);
1375 INSERT_D2_UB(tp0, tp1, dst0);
1376 INSERT_D2_UB(tp2, tp3, dst1);
1377 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
1378 INSERT_D2_UB(tp0, tp1, dst2);
1379 INSERT_D2_UB(tp2, tp3, dst3);
1380 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1381 src0, src1, src2, src3);
1382 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1383 src4, src5, src6, src7);
1384 DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1385 coeff_vec, res0, res1, res2, res3);
1386 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1387 coeff_vec, res4, res5, res6, res7);
1388 SLLI_4V(res0, res1, res2, res3, 3);
1389 SLLI_4V(res4, res5, res6, res7, 3);
1390 SRARI_H4_UH(res0, res1, res2, res3, 6);
1391 SRARI_H4_UH(res4, res5, res6, res7, 6);
1392 SAT_UH4_UH(res0, res1, res2, res3, 7);
1393 SAT_UH4_UH(res0, res1, res2, res3, 7);
1394 PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1395 PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
1396 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1397 AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
1398 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
1399 }
1400
avc_chroma_vt_and_aver_dst_8w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coeff0,uint32_t coeff1,int32_t height)1401 static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
1402 int32_t stride, uint32_t coeff0,
1403 uint32_t coeff1, int32_t height)
1404 {
1405 if (4 == height) {
1406 avc_chroma_vt_and_aver_dst_8x4_msa(src, dst, stride, coeff0, coeff1);
1407 } else if (8 == height) {
1408 avc_chroma_vt_and_aver_dst_8x8_msa(src, dst, stride, coeff0, coeff1);
1409 }
1410 }
1411
avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)1412 static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, uint8_t *dst,
1413 int32_t stride,
1414 uint32_t coef_hor0,
1415 uint32_t coef_hor1,
1416 uint32_t coef_ver0,
1417 uint32_t coef_ver1)
1418 {
1419 uint16_t out0, out1;
1420 v16u8 dst0 = { 0 };
1421 v16u8 src0, src1, src2;
1422 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1423 v16i8 res, mask;
1424 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1425 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1426 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1427 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1428 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1429
1430 mask = LD_SB(&chroma_mask_arr[48]);
1431
1432 LD_UB3(src, stride, src0, src1, src2);
1433 out0 = LH(dst);
1434 out1 = LH(dst + stride);
1435 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
1436 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
1437 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1438 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1439 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1440
1441 res_vt0 += res_vt1;
1442 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1443 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1444 res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1445 dst0 = __msa_aver_u_b((v16u8) res, dst0);
1446 out0 = __msa_copy_u_h((v8i16) dst0, 0);
1447 out1 = __msa_copy_u_h((v8i16) dst0, 1);
1448
1449 SH(out0, dst);
1450 dst += stride;
1451 SH(out1, dst);
1452 }
1453
avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)1454 static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, uint8_t *dst,
1455 int32_t stride,
1456 uint32_t coef_hor0,
1457 uint32_t coef_hor1,
1458 uint32_t coef_ver0,
1459 uint32_t coef_ver1)
1460 {
1461 uint16_t tp0, tp1, tp2, tp3;
1462 v16u8 src0, src1, src2, src3, src4;
1463 v16u8 tmp0, tmp1, tmp2, tmp3;
1464 v16u8 dst0 = { 0 };
1465 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1466 v16i8 res, mask;
1467 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1468 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1469 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1470 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1471 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1472
1473 mask = LD_SB(&chroma_mask_arr[48]);
1474
1475 LD_UB5(src, stride, src0, src1, src2, src3, src4);
1476 tp0 = LH(dst);
1477 tp1 = LH(dst + stride);
1478 tp2 = LH(dst + 2 * stride);
1479 tp3 = LH(dst + 3 * stride);
1480 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
1481 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
1482 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
1483 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
1484 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
1485 VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
1486 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1487 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1488 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1489
1490 res_vt0 += res_vt1;
1491 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1492 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1493 res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1494 dst0 = __msa_aver_u_b((v16u8) res, dst0);
1495
1496 ST_H4(dst0, 0, 1, 2, 3, dst, stride);
1497 }
1498
avc_chroma_hv_and_aver_dst_2w_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1,int32_t height)1499 static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
1500 int32_t stride,
1501 uint32_t coef_hor0,
1502 uint32_t coef_hor1,
1503 uint32_t coef_ver0,
1504 uint32_t coef_ver1,
1505 int32_t height)
1506 {
1507 if (2 == height) {
1508 avc_chroma_hv_and_aver_dst_2x2_msa(src, dst, stride, coef_hor0,
1509 coef_hor1, coef_ver0, coef_ver1);
1510 } else if (4 == height) {
1511 avc_chroma_hv_and_aver_dst_2x4_msa(src, dst, stride, coef_hor0,
1512 coef_hor1, coef_ver0, coef_ver1);
1513 }
1514 }
1515
avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t * src,uint8_t * dst,int32_t stride,uint32_t coef_hor0,uint32_t coef_hor1,uint32_t coef_ver0,uint32_t coef_ver1)1516 static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, uint8_t *dst,
1517 int32_t stride,
1518 uint32_t coef_hor0,
1519 uint32_t coef_hor1,
1520 uint32_t coef_ver0,
1521 uint32_t coef_ver1)
1522 {
1523 uint32_t tp0, tp1;
1524 v16u8 src0, src1, src2;
1525 v16u8 dst0, dst_data = { 0 };
1526 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1527 v16i8 mask;
1528 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1529 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1530 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1531 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1532 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1533
1534 mask = LD_SB(&chroma_mask_arr[0]);
1535
1536 LD_UB3(src, stride, src0, src1, src2);
1537 LW2(dst, stride, tp0, tp1);
1538 INSERT_W2_UB(tp0, tp1, dst_data);
1539 VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1540 DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1541 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1542
1543 res_vt0 += res_vt1;
1544 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1545 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1546 dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1547 dst0 = __msa_aver_u_b(dst0, dst_data);
1548
1549 ST_W2(dst0, 0, 1, dst, stride);
1550 }
1551
/* 4x4 two-dimensional bilinear chroma interpolation, with the result
 * averaged into the existing destination pixels. Tap weights as in the
 * 4x2 variant (callers pass x / 8 - x and y / 8 - y). */
static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4;
    v16u8 out, dst_data = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16i8 mask;
    /* Interleaved horizontal taps for the pairwise dot-product. */
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    /* Shuffle pattern pairing each pixel with its right neighbour. */
    mask = LD_SB(&chroma_mask_arr[0]);

    /* Five source rows produce four vertically filtered output rows. */
    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    /* Load the four 4-byte destination rows for the final average. */
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst_data);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    /* Horizontal filter on all four shuffled row pairs. */
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
                res_hz3);
    /* Vertical filter: upper rows weighted by coef_ver1, lower rows by
     * coef_ver0, summed pairwise below. */
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    /* Round (>>6 with rounding), saturate to 8-bit, pack to bytes. */
    SRARI_H2_UH(res_vt0, res_vt1, 6);
    SAT_UH2_UH(res_vt0, res_vt1, 7);
    out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
    /* Rounded average with the existing destination pixels. */
    out = __msa_aver_u_b(out, dst_data);
    ST_W4(out, 0, 1, 2, 3, dst, stride);
}
1590
/* 4x8 two-dimensional bilinear chroma interpolation, with the result
 * averaged into the existing destination pixels. Tap weights as in the
 * 4x2 variant (callers pass x / 8 - x and y / 8 - y). */
static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    /* Interleaved horizontal taps for the pairwise dot-product. */
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    /* Shuffle pattern pairing each pixel with its right neighbour. */
    mask = LD_SB(&chroma_mask_arr[0]);

    /* Nine source rows produce eight vertically filtered output rows. */
    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    /* Gather the eight 4-byte destination rows into two vectors. */
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
    VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
    VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
    VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
    VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
    /* Horizontal filter on all eight shuffled row pairs. */
    DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
    DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
    /* Vertical filter: upper rows weighted by coef_ver1, lower rows by
     * coef_ver0, summed pairwise below. */
    MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
         res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
         res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
    ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
    ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
    /* Round (>>6 with rounding), saturate to 8-bit, pack to bytes. */
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
    /* Rounded average with the existing destination pixels. */
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
1639
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    /* Width-4 H+V chroma interpolation with destination averaging:
     * dispatch to the fixed-height kernel. Heights other than 2, 4 and 8
     * are not handled here. */
    switch (height) {
    case 2:
        avc_chroma_hv_and_aver_dst_4x2_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 4:
        avc_chroma_hv_and_aver_dst_4x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 8:
        avc_chroma_hv_and_aver_dst_4x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    }
}
1659
/* 8x4 two-dimensional bilinear chroma interpolation, with the result
 * averaged into the existing destination pixels. Tap weights as in the
 * narrower variants (callers pass x / 8 - x and y / 8 - y). */
static void avc_chroma_hv_and_aver_dst_8x4_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, out0, out1;
    v8u16 res_hz0, res_hz1, res_hz2;
    v8u16 res_hz3, res_hz4;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v16u8 dst0 = { 0 }, dst1 = { 0 };
    v16i8 mask;
    /* Interleaved horizontal taps for the pairwise dot-product. */
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    /* Width-8 shuffle pattern (third row of chroma_mask_arr): pairs the
     * eight pixels of one row with their right neighbours. */
    mask = LD_SB(&chroma_mask_arr[32]);

    /* Row 0 is filtered horizontally up front; it only contributes to the
     * first output row via the vertical filter below. */
    src0 = LD_UB(src);
    src += stride;
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    LD_UB4(src, stride, src1, src2, src3, src4);
    src += (4 * stride);
    /* Gather the four 8-byte destination rows into two vectors. */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    /* Horizontal filter on rows 1..4. */
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
    /* Vertical filter: output row n = coef_ver1 * hz(row n)
     *                                + coef_ver0 * hz(row n + 1). */
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
         res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    /* Round (>>6 with rounding), saturate to 8-bit, pack to bytes. */
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    /* Rounded average with the existing destination pixels. */
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
1707
/* 8x8 two-dimensional bilinear chroma interpolation, with the result
 * averaged into the existing destination pixels. Tap weights as in the
 * narrower variants (callers pass x / 8 - x and y / 8 - y). */
static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
                                               int32_t stride,
                                               uint32_t coef_hor0,
                                               uint32_t coef_hor1,
                                               uint32_t coef_ver0,
                                               uint32_t coef_ver1)
{
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 out0, out1, out2, out3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
    v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
    v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
    v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
    v16i8 mask;
    /* Interleaved horizontal taps for the pairwise dot-product. */
    v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
    v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
    v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
    v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
    v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);

    /* Width-8 shuffle pattern (third row of chroma_mask_arr). */
    mask = LD_SB(&chroma_mask_arr[32]);

    /* Nine source rows produce eight vertically filtered output rows. */
    LD_UB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_UB4(src, stride, src5, src6, src7, src8);
    /* Pair each pixel with its right neighbour, one row per vector. */
    src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
    VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
    VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
    VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
    VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
    /* Horizontal filter on all nine rows. */
    res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
    DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
                res_hz4);
    DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
                coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
    /* Vertical filter, lower-row terms: coef_ver0 * hz(row n + 1). */
    MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
         coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
         res_vt3);
    MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
         coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
         res_vt7);
    /* Gather the eight 8-byte destination rows into four vectors. */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    /* Vertical filter, upper-row terms: add coef_ver1 * hz(row n). */
    res_vt0 += (res_hz0 * coeff_vt_vec1);
    res_vt1 += (res_hz1 * coeff_vt_vec1);
    res_vt2 += (res_hz2 * coeff_vt_vec1);
    res_vt3 += (res_hz3 * coeff_vt_vec1);
    res_vt4 += (res_hz4 * coeff_vt_vec1);
    res_vt5 += (res_hz5 * coeff_vt_vec1);
    res_vt6 += (res_hz6 * coeff_vt_vec1);
    res_vt7 += (res_hz7 * coeff_vt_vec1);
    /* Round (>>6 with rounding), saturate to 8-bit, pack to bytes. */
    SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
    SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
    SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
    SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
    PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
    PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
    /* Rounded average with the existing destination pixels. */
    AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
    AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
    ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
1776
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
                                              int32_t stride,
                                              uint32_t coef_hor0,
                                              uint32_t coef_hor1,
                                              uint32_t coef_ver0,
                                              uint32_t coef_ver1,
                                              int32_t height)
{
    /* Width-8 H+V chroma interpolation with destination averaging:
     * dispatch to the fixed-height kernel. Heights other than 4 and 8
     * are not handled here. */
    switch (height) {
    case 4:
        avc_chroma_hv_and_aver_dst_8x4_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    case 8:
        avc_chroma_hv_and_aver_dst_8x8_msa(src, dst, stride, coef_hor0,
                                           coef_hor1, coef_ver0, coef_ver1);
        break;
    }
}
1793
static void copy_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    /* Unfiltered 4-byte-wide block copy for heights 2, 4 and 8. */
    uint32_t w0, w1, w2, w3, w4, w5, w6, w7;

    switch (height) {
    case 8:
        LW4(src, stride, w0, w1, w2, w3);
        src += 4 * stride;
        LW4(src, stride, w4, w5, w6, w7);
        SW4(w0, w1, w2, w3, dst, stride);
        dst += 4 * stride;
        SW4(w4, w5, w6, w7, dst, stride);
        break;
    case 4:
        LW4(src, stride, w0, w1, w2, w3);
        SW4(w0, w1, w2, w3, dst, stride);
        break;
    case 2:
        LW2(src, stride, w0, w1);
        SW(w0, dst);
        dst += stride;
        SW(w1, dst);
        break;
    }
}
1816
static void copy_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                            int32_t height)
{
    /* Unfiltered 8-byte-wide block copy for heights 4 and 8. */
    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;

    switch (height) {
    case 8:
        LD4(src, stride, d0, d1, d2, d3);
        src += 4 * stride;
        LD4(src, stride, d4, d5, d6, d7);
        SD4(d0, d1, d2, d3, dst, stride);
        dst += 4 * stride;
        SD4(d4, d5, d6, d7, dst, stride);
        break;
    case 4:
        LD4(src, stride, d0, d1, d2, d3);
        SD4(d0, d1, d2, d3, dst, stride);
        break;
    }
}
1834
static void avg_width4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    /* Rounded byte-wise average of a 4-byte-wide src block into dst,
     * for heights 2, 4 and 8. */
    uint32_t w0, w1, w2, w3;
    v16u8 in0 = { 0 }, in1 = { 0 }, out0 = { 0 }, out1 = { 0 };

    switch (height) {
    case 8:
        LW4(src, stride, w0, w1, w2, w3);
        src += 4 * stride;
        INSERT_W4_UB(w0, w1, w2, w3, in0);
        LW4(src, stride, w0, w1, w2, w3);
        INSERT_W4_UB(w0, w1, w2, w3, in1);
        LW4(dst, stride, w0, w1, w2, w3);
        INSERT_W4_UB(w0, w1, w2, w3, out0);
        LW4(dst + 4 * stride, stride, w0, w1, w2, w3);
        INSERT_W4_UB(w0, w1, w2, w3, out1);
        AVER_UB2_UB(in0, out0, in1, out1, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
        break;
    case 4:
        LW4(src, stride, w0, w1, w2, w3);
        INSERT_W4_UB(w0, w1, w2, w3, in0);
        LW4(dst, stride, w0, w1, w2, w3);
        INSERT_W4_UB(w0, w1, w2, w3, out0);
        out0 = __msa_aver_u_b(in0, out0);
        ST_W4(out0, 0, 1, 2, 3, dst, stride);
        break;
    case 2:
        LW2(src, stride, w0, w1);
        INSERT_W2_UB(w0, w1, in0);
        LW2(dst, stride, w0, w1);
        INSERT_W2_UB(w0, w1, out0);
        out0 = __msa_aver_u_b(in0, out0);
        ST_W2(out0, 0, 1, dst, stride);
        break;
    }
}
1869
static void avg_width8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
                           int32_t height)
{
    /* Rounded byte-wise average of an 8-byte-wide src block into dst,
     * for heights 4 and 8. */
    uint64_t d0, d1, d2, d3, d4, d5, d6, d7;
    v16u8 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v16u8 out0 = { 0 }, out1 = { 0 }, out2 = { 0 }, out3 = { 0 };

    switch (height) {
    case 8:
        LD4(src, stride, d0, d1, d2, d3);
        src += 4 * stride;
        LD4(src, stride, d4, d5, d6, d7);
        INSERT_D2_UB(d0, d1, in0);
        INSERT_D2_UB(d2, d3, in1);
        INSERT_D2_UB(d4, d5, in2);
        INSERT_D2_UB(d6, d7, in3);
        LD4(dst, stride, d0, d1, d2, d3);
        LD4(dst + 4 * stride, stride, d4, d5, d6, d7);
        INSERT_D2_UB(d0, d1, out0);
        INSERT_D2_UB(d2, d3, out1);
        INSERT_D2_UB(d4, d5, out2);
        INSERT_D2_UB(d6, d7, out3);
        AVER_UB4_UB(in0, out0, in1, out1, in2, out2, in3, out3, out0, out1,
                    out2, out3);
        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
        break;
    case 4:
        LD4(src, stride, d0, d1, d2, d3);
        INSERT_D2_UB(d0, d1, in0);
        INSERT_D2_UB(d2, d3, in1);
        LD4(dst, stride, d0, d1, d2, d3);
        INSERT_D2_UB(d0, d1, out0);
        INSERT_D2_UB(d2, d3, out1);
        AVER_UB2_UB(in0, out0, in1, out1, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
        break;
    }
}
1905
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Select the width-8 kernel from the fractional offsets:
     * no offset -> copy, one axis -> 1D filter, both -> 2D filter. */
    if (!x && !y) {
        copy_width8_msa(src, dst, stride, height);
    } else if (!y) {
        avc_chroma_hz_8w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_8w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_8w_msa(src, dst, stride, x, 8 - x, y, 8 - y, height);
    }
}
1921
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Select the width-4 kernel from the fractional offsets:
     * no offset -> copy, one axis -> 1D filter, both -> 2D filter. */
    if (!x && !y) {
        copy_width4_msa(src, dst, stride, height);
    } else if (!y) {
        avc_chroma_hz_4w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_4w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_4w_msa(src, dst, stride, x, 8 - x, y, 8 - y, height);
    }
}
1937
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Select the width-2 kernel from the fractional offsets:
     * no offset -> plain 2-byte row copy, one axis -> 1D filter,
     * both -> 2D filter. */
    if (!x && !y) {
        int32_t row = height;

        while (row--) {
            *((uint16_t *) dst) = *((uint16_t *) src);
            src += stride;
            dst += stride;
        }
    } else if (!y) {
        avc_chroma_hz_2w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_2w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_2w_msa(src, dst, stride, x, 8 - x, y, 8 - y, height);
    }
}
1960
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Width-8 averaging MC: like the "put" variant, but every kernel
     * rounds-averages its result with the existing dst pixels. */
    if (!x && !y) {
        avg_width8_msa(src, dst, stride, height);
    } else if (!y) {
        avc_chroma_hz_and_aver_dst_8w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_and_aver_dst_8w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_and_aver_dst_8w_msa(src, dst, stride, x, 8 - x, y,
                                          8 - y, height);
    }
}
1978
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Width-4 averaging MC: like the "put" variant, but every kernel
     * rounds-averages its result with the existing dst pixels. */
    if (!x && !y) {
        avg_width4_msa(src, dst, stride, height);
    } else if (!y) {
        avc_chroma_hz_and_aver_dst_4w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_and_aver_dst_4w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_and_aver_dst_4w_msa(src, dst, stride, x, 8 - x, y,
                                          8 - y, height);
    }
}
1995
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
                                ptrdiff_t stride, int height, int x, int y)
{
    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    /* Width-2 averaging MC: no offset -> scalar rounded average of the
     * two pixels per row, one axis -> 1D filter, both -> 2D filter. */
    if (!x && !y) {
        int32_t row = height;

        while (row--) {
            dst[0] = (dst[0] + src[0] + 1) >> 1;
            dst[1] = (dst[1] + src[1] + 1) >> 1;
            src += stride;
            dst += stride;
        }
    } else if (!y) {
        avc_chroma_hz_and_aver_dst_2w_msa(src, dst, stride, x, 8 - x, height);
    } else if (!x) {
        avc_chroma_vt_and_aver_dst_2w_msa(src, dst, stride, y, 8 - y, height);
    } else {
        avc_chroma_hv_and_aver_dst_2w_msa(src, dst, stride, x, 8 - x, y,
                                          8 - y, height);
    }
}
2020