1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "qpeldsp_mips.h"
23 
/*
 * Horizontal quarter-pel filter producing 16 output bytes (rounded variant).
 *
 * inp0, inp1 : 16-byte source vectors; the callers in this file load inp1
 *              from one byte after inp0.
 * mask       : byte shuffle mask applied to inp0 and inp1 to build tmp0 and
 *              tmp1, which the byte slides below combine with the inputs.
 *              NOTE(review): callers here pass indices 15..0, which looks
 *              like a byte reversal used for edge mirroring - confirm
 *              against the VSHF_B2_UB definition.
 * coef0      : halfword vector multiplied directly into the first pair sums.
 * coef1/2    : byte vectors passed to the dot-product-accumulate steps.
 *
 * The value computed is
 *   (coef0-weighted sums + coef2-weighted sums)
 *     - (coef1-weighted sums + unweighted sums)
 * which is then shifted right by 5 via SRARI_H2_SH, clipped to 0..255 and
 * packed back to bytes.  The macro is a GNU statement expression that
 * evaluates to a v16u8.
 *
 * The locals declared inside (out, tmp0, data0, sum0_r, ...) shadow any
 * caller variable of the same name, so arguments must not use those names.
 */
#define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2)  \
( {                                                                     \
    v16u8 out, tmp0, tmp1;                                              \
    v16u8 data0, data1, data2, data3, data4, data5;                     \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
    sum0_r *= (v8u16) (coef0);                                          \
    sum0_l *= (v8u16) (coef0);                                          \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    /* sum0 holds the added terms, sum3 the subtracted terms */         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    SRARI_H2_SH(res_r, res_l, 5);                                       \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    out;                                                                \
} )
57 
/*
 * Horizontal quarter-pel filter for two source vectors at once, each
 * contributing one v8i16 of results (rounded variant).
 *
 * inp0, inp1   : source vectors (the callers in this file pass two rows).
 * mask0..mask3 : byte shuffle masks; each one is applied to inp0 and to
 *                inp1 to gather the byte pairs for one filter term.
 * coef0..coef2 : byte vectors used as dot-product multipliers for the
 *                mask0, mask1 and mask2 terms; the mask3 term is only
 *                summed with HADD_UB2_UH, not multiplied.
 *
 * Per input: (coef0 term + coef2 term) - (coef1 term + mask3 term),
 * shifted right by 5 via SRARI_H2_SH and clipped to 0..255.  The two
 * result vectors are packed into one v16u8 with __msa_pckev_b, which is
 * the value of the statement expression.
 *
 * Arguments must not be named like the locals declared here (out,
 * sum0_r..sum7_r, res0_r, res1_r).
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,                       \
                                      mask0, mask1, mask2, mask3,       \
                                      coef0, coef1, coef2)              \
( {                                                                     \
    v16u8 out;                                                          \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                               \
    v8i16 res0_r, res1_r;                                               \
                                                                        \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);   \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);   \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                        \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);          \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);   \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);   \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);         \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);         \
    /* sum0/sum4: added terms, sum3/sum7: subtracted terms */           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
    res1_r = (v8i16) (sum4_r - sum7_r);                                 \
    SRARI_H2_SH(res0_r, res1_r, 5);                                     \
    CLIP_SH2_0_255(res0_r, res1_r);                                     \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);        \
                                                                        \
    out;                                                                \
} )
83 
/*
 * Single-input form of the 8-byte horizontal quarter-pel filter (rounded
 * variant).
 *
 * inp0         : source vector.
 * mask0..mask3 : byte shuffle masks, each applied to inp0 to gather the
 *                byte pairs for one filter term.
 * coef0..coef2 : byte vectors used as multipliers for the mask0, mask1 and
 *                mask2 terms; the mask3 term is only summed with
 *                __msa_hadd_u_h.
 *
 * Result: (coef0 term + coef2 term) - (coef1 term + mask3 term), shifted
 * right by 5 with __msa_srari_h and clipped to 0..255.  Both operands of
 * the final __msa_pckev_b are res0_r, so the same results are packed twice
 * into the returned v16u8.
 *
 * Arguments must not be named like the locals declared here.
 */
#define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,                        \
                                           mask0, mask1, mask2, mask3,  \
                                           coef0, coef1, coef2)         \
( {                                                                     \
    v16u8 out;                                                          \
    v8i16 res0_r;                                                       \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
                                                                        \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);   \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);            \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);             \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);   \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);         \
    /* sum0_r: added terms, sum3_r: subtracted terms */                 \
    res0_r = (v8i16) (sum0_r - sum3_r);                                 \
    res0_r = __msa_srari_h(res0_r, 5);                                  \
    CLIP_SH_0_255(res0_r);                                              \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);        \
                                                                        \
    out;                                                                \
} )
104 
/*
 * "No round" counterpart of APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW.
 *
 * The filter terms are built exactly as in the rounded macro; the only
 * difference is the final scaling: instead of __msa_srari_h(..., 5) the
 * difference has 15 added to every element and is then shifted right by 5
 * with the plain vector >>= operator.
 *
 * inp0         : source vector.
 * mask0..mask3 : byte shuffle masks, one per filter term.
 * coef0..coef2 : byte vectors used as multipliers for the mask0, mask1 and
 *                mask2 terms.
 *
 * Evaluates to a v16u8 in which the clipped results are packed twice
 * (both __msa_pckev_b operands are res0_r).
 *
 * Arguments must not be named like the locals declared here.
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,   \
                                                    mask2, mask3, coef0,  \
                                                    coef1, coef2)         \
( {                                                                       \
    v16u8 out;                                                            \
    v8i16 res0_r;                                                         \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                 \
                                                                          \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r);     \
    sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r);              \
    sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0);               \
    VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r);     \
    DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r);           \
    res0_r = (v8i16) (sum0_r - sum3_r);                                   \
    /* bias of 15 (not 16) before the shift: the "no round" scaling */    \
    res0_r += 15;                                                         \
    res0_r >>= 5;                                                         \
    CLIP_SH_0_255(res0_r);                                                \
    out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r);          \
                                                                          \
    out;                                                                  \
} )
126 
/*
 * "No round" counterpart of APPLY_HORIZ_QPEL_FILTER (16 output bytes).
 *
 * The shuffles, slides and weighted sums are identical to the rounded
 * macro; only the final scaling differs: 15 is added to each element and
 * the result is shifted right by 5 with the vector >>= operator instead of
 * using SRARI_H2_SH.
 *
 * inp0, inp1 : 16-byte source vectors.
 * mask       : byte shuffle mask used to derive tmp0/tmp1 from inp0/inp1.
 * coef0      : halfword vector multiplied directly into the first pair sums.
 * coef1/2    : byte vectors passed to the dot-product-accumulate steps.
 *
 * Evaluates to a v16u8 of results clipped to 0..255.
 *
 * Arguments must not be named like the locals declared here (out, tmp0,
 * data0, sum0_r, ...).
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,              \
                                         coef0, coef1, coef2)           \
( {                                                                     \
    v16u8 out, tmp0, tmp1;                                              \
    v16u8 data0, data1, data2, data3, data4, data5;                     \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1);         \
    ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l);                            \
    data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15);       \
    data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1);        \
    HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l);                        \
    ILVRL_B2_UH(data3, data0, sum1_r, sum1_l);                          \
    data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14);       \
    data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2);        \
    sum0_r *= (v8u16) (coef0);                                          \
    sum0_l *= (v8u16) (coef0);                                          \
    ILVRL_B2_UH(data4, data1, sum2_r, sum2_l);                          \
    data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13);       \
    data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3);        \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    ILVRL_B2_UH(data5, data2, sum3_r, sum3_l);                          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    /* bias of 15 (not 16) before the shift: the "no round" scaling */  \
    res_r += 15;                                                        \
    res_l += 15;                                                        \
    res_r >>= 5;                                                        \
    res_l >>= 5;                                                        \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    out;                                                                \
} )
164 
/*
 * "No round" counterpart of APPLY_HORIZ_QPEL_FILTER_8BYTE: filters two
 * source vectors at once and packs both results into one v16u8.
 *
 * The filter terms are built as in the rounded macro; the final scaling
 * adds 15 to each element and shifts right by 5 with the vector >>=
 * operator instead of using SRARI_H2_SH.
 *
 * inp0, inp1   : source vectors.
 * mask0..mask3 : byte shuffle masks, one per filter term, applied to both
 *                inputs.
 * coef0..coef2 : byte vectors used as multipliers for the mask0, mask1 and
 *                mask2 terms; the mask3 term is only summed.
 *
 * Arguments must not be named like the locals declared here (out,
 * sum0_r..sum7_r, res0_r, res1_r).
 */
#define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1,                  \
                                               mask0, mask1, mask2, mask3,  \
                                               coef0, coef1, coef2)         \
( {                                                                         \
    v16u8 out;                                                              \
    v8i16 res0_r, res1_r;                                                   \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                                   \
    v8u16 sum4_r, sum5_r, sum6_r, sum7_r;                                   \
                                                                            \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r);       \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r);       \
    HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r);                            \
    DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r);              \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r);       \
    VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r);       \
    DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r);             \
    DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r);             \
    res0_r = (v8i16) (sum0_r - sum3_r);                                     \
    res1_r = (v8i16) (sum4_r - sum7_r);                                     \
    /* bias of 15 (not 16) before the shift: the "no round" scaling */      \
    res0_r += 15;                                                           \
    res1_r += 15;                                                           \
    res0_r >>= 5;                                                           \
    res1_r >>= 5;                                                           \
    CLIP_SH2_0_255(res0_r, res1_r);                                         \
    out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);            \
                                                                            \
    out;                                                                    \
} )
193 
/*
 * Vertical quarter-pel filter over eight input vectors, producing 16
 * output bytes (rounded variant).
 *
 * inp0..inp7 : source vectors, combined pairwise as (inp0, inp4),
 *              (inp1, inp5), (inp2, inp6) and (inp3, inp7).
 *              NOTE(review): presumably one vector per source row -
 *              confirm at the call sites.
 * coef0      : byte vector, multiplier for the (inp0, inp4) pair.
 * coef1      : byte vector, multiplier for the (inp1, inp5) pair.
 * coef2      : byte vector, multiplier for the (inp2, inp6) pair.
 *
 * Result: (coef0 term + coef2 term) - (coef1 term + plain sum of the
 * (inp3, inp7) pair), shifted right by 5 via SRARI_H2_SH, clipped to
 * 0..255 and packed into the v16u8 the statement expression evaluates to.
 *
 * Arguments must not be named like the locals declared here (res, res_r,
 * res_l, sum0_r, ...).
 */
#define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3,                  \
                               inp4, inp5, inp6, inp7,                  \
                               coef0, coef1, coef2)                     \
( {                                                                     \
    v16u8 res;                                                          \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    /* sum0: added terms, sum3: subtracted terms */                     \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    SRARI_H2_SH(res_r, res_l, 5);                                       \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    res;                                                                \
} )
219 
/*
 * Vertical quarter-pel filter for two independent sets of eight input
 * vectors (rounded variant).  Only the ILVR ("right") interleave is used,
 * so each set yields one v8i16 of results; the two are packed into a
 * single v16u8.
 *
 * inp00..inp07 : first input set, paired as (inp00, inp04), (inp01, inp05),
 *                (inp02, inp06), (inp03, inp07).
 * inp10..inp17 : second input set, paired the same way.
 * coef0..coef2 : byte vectors, multipliers for the (x0, x4), (x1, x5) and
 *                (x2, x6) pairs; the (x3, x7) pair is only summed.
 *
 * Per set: (coef0 term + coef2 term) - (coef1 term + (x3, x7) sum),
 * shifted right by 5 via SRARI_H2_SH and clipped to 0..255.
 *
 * Arguments must not be named like the locals declared here (res, val0,
 * val1, sum00..sum13).
 */
#define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03,        \
                                     inp04, inp05, inp06, inp07,        \
                                     inp10, inp11, inp12, inp13,        \
                                     inp14, inp15, inp16, inp17,        \
                                     coef0, coef1, coef2)               \
( {                                                                     \
    v16u8 res;                                                          \
    v8i16 val0, val1;                                                   \
    v8u16 sum00, sum01, sum02, sum03;                                   \
    v8u16 sum10, sum11, sum12, sum13;                                   \
                                                                        \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,  \
               sum00, sum10, sum03, sum13);                             \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);              \
    HADD_UB2_UH(sum03, sum13, sum03, sum13);                            \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,  \
               sum02, sum12, sum01, sum11);                             \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);             \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);             \
    /* sumX0: added terms, sumX3: subtracted terms */                   \
    val0 = (v8i16) (sum00 - sum03);                                     \
    val1 = (v8i16) (sum10 - sum13);                                     \
    SRARI_H2_SH(val0, val1, 5);                                         \
    CLIP_SH2_0_255(val0, val1);                                         \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);            \
                                                                        \
    res;                                                                \
} )
247 
/*
 * "No round" counterpart of APPLY_VERT_QPEL_FILTER (16 output bytes).
 *
 * The pairing of inputs and the weighted sums are the same as in the
 * rounded macro; the final scaling adds 15 to each element and shifts
 * right by 5 with the vector >>= operator instead of using SRARI_H2_SH.
 *
 * inp0..inp7   : source vectors, paired as (inp0, inp4), (inp1, inp5),
 *                (inp2, inp6), (inp3, inp7).
 * coef0..coef2 : byte vectors, multipliers for the first three pairs; the
 *                (inp3, inp7) pair is only summed.
 *
 * Evaluates to a v16u8 of results clipped to 0..255.
 *
 * Arguments must not be named like the locals declared here (res, res_r,
 * res_l, sum0_r, ...).
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3,         \
                                        inp4, inp5, inp6, inp7,         \
                                        coef0, coef1, coef2)            \
( {                                                                     \
    v16u8 res;                                                          \
    v8i16 res_r, res_l;                                                 \
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r;                               \
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l;                               \
                                                                        \
    ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l);                            \
    ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l);                            \
    DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l);          \
    HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l);                        \
    ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l);                            \
    ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l);                            \
    DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l);         \
    DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l);         \
    res_r = (v8i16) (sum0_r - sum3_r);                                  \
    res_l = (v8i16) (sum0_l - sum3_l);                                  \
    /* bias of 15 (not 16) before the shift: the "no round" scaling */  \
    res_r += 15;                                                        \
    res_l += 15;                                                        \
    res_r >>= 5;                                                        \
    res_l >>= 5;                                                        \
    CLIP_SH2_0_255(res_r, res_l);                                       \
    res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r);          \
                                                                        \
    res;                                                                \
} )
276 
/*
 * "No round" counterpart of APPLY_VERT_QPEL_FILTER_8BYTE: filters two
 * independent sets of eight input vectors and packs both results into one
 * v16u8.
 *
 * The pairing and weighted sums match the rounded macro; the final scaling
 * adds 15 to each element and shifts right by 5 with the vector >>=
 * operator instead of using SRARI_H2_SH.
 *
 * inp00..inp07 : first input set.
 * inp10..inp17 : second input set.
 * coef0..coef2 : byte vectors, multipliers for the (x0, x4), (x1, x5) and
 *                (x2, x6) pairs; the (x3, x7) pair is only summed.
 *
 * Arguments must not be named like the locals declared here (res, val0,
 * val1, sum00..sum13).
 */
#define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03,  \
                                              inp04, inp05, inp06, inp07,  \
                                              inp10, inp11, inp12, inp13,  \
                                              inp14, inp15, inp16, inp17,  \
                                              coef0, coef1, coef2)         \
( {                                                                        \
    v16u8 res;                                                             \
    v8i16 val0, val1;                                                      \
    v8u16 sum00, sum01, sum02, sum03;                                      \
    v8u16 sum10, sum11, sum12, sum13;                                      \
                                                                           \
    ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13,     \
               sum00, sum10, sum03, sum13);                                \
    DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10);                 \
    HADD_UB2_UH(sum03, sum13, sum03, sum13);                               \
    ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11,     \
               sum02, sum12, sum01, sum11);                                \
    DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10);                \
    DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13);                \
    val0 = (v8i16) (sum00 - sum03);                                        \
    val1 = (v8i16) (sum10 - sum13);                                        \
    /* bias of 15 (not 16) before the shift: the "no round" scaling */     \
    val0 += 15;                                                            \
    val1 += 15;                                                            \
    val0 >>= 5;                                                            \
    val1 >>= 5;                                                            \
    CLIP_SH2_0_255(val0, val1);                                            \
    res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0);               \
                                                                           \
    res;                                                                   \
} )
307 
horiz_mc_qpel_aver_src0_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)308 static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src,
309                                                int32_t src_stride,
310                                                uint8_t *dst,
311                                                int32_t dst_stride,
312                                                int32_t height)
313 {
314     uint8_t loop_count;
315     v16u8 inp0, inp1, inp2, inp3;
316     v16u8 res0, res1;
317     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
321     v16u8 const20 = (v16u8) __msa_ldi_b(20);
322     v16u8 const6 = (v16u8) __msa_ldi_b(6);
323     v16u8 const3 = (v16u8) __msa_ldi_b(3);
324 
325     for (loop_count = (height >> 2); loop_count--;) {
326         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327         src += (4 * src_stride);
328         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
329                                              mask0, mask1, mask2, mask3,
330                                              const20, const6, const3);
331         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
332                                              mask0, mask1, mask2, mask3,
333                                              const20, const6, const3);
334         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
336         AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
337         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
338         dst += (4 * dst_stride);
339     }
340 }
341 
horiz_mc_qpel_aver_src0_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)342 static void horiz_mc_qpel_aver_src0_16width_msa(const uint8_t *src,
343                                                 int32_t src_stride,
344                                                 uint8_t *dst,
345                                                 int32_t dst_stride,
346                                                 int32_t height)
347 {
348     uint8_t loop_count;
349     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
350     v16u8 res;
351     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352     v16u8 const6 = (v16u8) __msa_ldi_b(6);
353     v16u8 const3 = (v16u8) __msa_ldi_b(3);
354     v8u16 const20 = (v8u16) __msa_ldi_h(20);
355 
356     for (loop_count = (height >> 2); loop_count--;) {
357         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359         src += (4 * src_stride);
360         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
361                                       const20, const6, const3);
362         res = __msa_aver_u_b(inp0, res);
363         ST_UB(res, dst);
364         dst += dst_stride;
365 
366         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
367                                       const20, const6, const3);
368         res = __msa_aver_u_b(inp2, res);
369         ST_UB(res, dst);
370         dst += dst_stride;
371 
372         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
373                                       const20, const6, const3);
374         res = __msa_aver_u_b(inp4, res);
375         ST_UB(res, dst);
376         dst += dst_stride;
377 
378         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
379                                       const20, const6, const3);
380         res = __msa_aver_u_b(inp6, res);
381         ST_UB(res, dst);
382         dst += dst_stride;
383     }
384 }
385 
horiz_mc_qpel_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)386 static void horiz_mc_qpel_8width_msa(const uint8_t *src,
387                                      int32_t src_stride,
388                                      uint8_t *dst,
389                                      int32_t dst_stride,
390                                      int32_t height)
391 {
392     uint8_t loop_count;
393     v16u8 inp0, inp1, inp2, inp3;
394     v16u8 res0, res1;
395     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
399     v16u8 const20 = (v16u8) __msa_ldi_b(20);
400     v16u8 const6 = (v16u8) __msa_ldi_b(6);
401     v16u8 const3 = (v16u8) __msa_ldi_b(3);
402 
403     for (loop_count = (height >> 2); loop_count--;) {
404         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405         src += (4 * src_stride);
406         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
407                                              mask0, mask1, mask2, mask3,
408                                              const20, const6, const3);
409         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
410                                              mask0, mask1, mask2, mask3,
411                                              const20, const6, const3);
412         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
413         dst += (4 * dst_stride);
414     }
415 }
416 
horiz_mc_qpel_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)417 static void horiz_mc_qpel_16width_msa(const uint8_t *src,
418                                       int32_t src_stride,
419                                       uint8_t *dst,
420                                       int32_t dst_stride,
421                                       int32_t height)
422 {
423     uint8_t loop_count;
424     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
425     v16u8 res;
426     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
427     v8u16 const20 = (v8u16) __msa_ldi_h(20);
428     v16u8 const6 = (v16u8) __msa_ldi_b(6);
429     v16u8 const3 = (v16u8) __msa_ldi_b(3);
430 
431     for (loop_count = (height >> 2); loop_count--;) {
432         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434         src += (4 * src_stride);
435         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
436                                       const20, const6, const3);
437         ST_UB(res, dst);
438         dst += dst_stride;
439 
440         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
441                                       const20, const6, const3);
442         ST_UB(res, dst);
443         dst += dst_stride;
444 
445         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
446                                       const20, const6, const3);
447         ST_UB(res, dst);
448         dst += dst_stride;
449 
450         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
451                                       const20, const6, const3);
452         ST_UB(res, dst);
453         dst += dst_stride;
454     }
455 }
456 
horiz_mc_qpel_aver_src1_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)457 static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src,
458                                                int32_t src_stride,
459                                                uint8_t *dst,
460                                                int32_t dst_stride,
461                                                int32_t height)
462 {
463     uint8_t loop_count;
464     v16u8 inp0, inp1, inp2, inp3;
465     v16u8 res0, res1;
466     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
470     v16u8 const20 = (v16u8) __msa_ldi_b(20);
471     v16u8 const6 = (v16u8) __msa_ldi_b(6);
472     v16u8 const3 = (v16u8) __msa_ldi_b(3);
473 
474     for (loop_count = (height >> 2); loop_count--;) {
475         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476         src += (4 * src_stride);
477         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
478                                              mask0, mask1, mask2, mask3,
479                                              const20, const6, const3);
480         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
481                                              mask0, mask1, mask2, mask3,
482                                              const20, const6, const3);
483         SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
484                    inp0, inp1, inp2, inp3);
485         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
487         AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
488         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
489         dst += (4 * dst_stride);
490     }
491 }
492 
horiz_mc_qpel_aver_src1_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)493 static void horiz_mc_qpel_aver_src1_16width_msa(const uint8_t *src,
494                                                 int32_t src_stride,
495                                                 uint8_t *dst,
496                                                 int32_t dst_stride,
497                                                 int32_t height)
498 {
499     uint8_t loop_count;
500     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
501     v16u8 res;
502     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
503     v8u16 const20 = (v8u16) __msa_ldi_h(20);
504     v16u8 const6 = (v16u8) __msa_ldi_b(6);
505     v16u8 const3 = (v16u8) __msa_ldi_b(3);
506 
507     for (loop_count = (height >> 2); loop_count--;) {
508         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510         src += (4 * src_stride);
511         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
512                                       const20, const6, const3);
513         res = __msa_aver_u_b(res, inp1);
514         ST_UB(res, dst);
515         dst += dst_stride;
516 
517         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
518                                       const20, const6, const3);
519         res = __msa_aver_u_b(res, inp3);
520         ST_UB(res, dst);
521         dst += dst_stride;
522 
523         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
524                                       const20, const6, const3);
525         res = __msa_aver_u_b(res, inp5);
526         ST_UB(res, dst);
527         dst += dst_stride;
528 
529         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
530                                       const20, const6, const3);
531         res = __msa_aver_u_b(res, inp7);
532         ST_UB(res, dst);
533         dst += dst_stride;
534     }
535 }
536 
horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)537 static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src,
538                                                       int32_t src_stride,
539                                                       uint8_t *dst,
540                                                       int32_t dst_stride,
541                                                       int32_t height)
542 {
543     uint8_t loop_count;
544     v16u8 inp0, inp1, inp2, inp3;
545     v16u8 res0, res1;
546     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
550     v16u8 const20 = (v16u8) __msa_ldi_b(20);
551     v16u8 const6 = (v16u8) __msa_ldi_b(6);
552     v16u8 const3 = (v16u8) __msa_ldi_b(3);
553 
554     for (loop_count = (height >> 2); loop_count--;) {
555         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556         src += (4 * src_stride);
557         res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
558                                                       mask2, mask3, const20,
559                                                       const6, const3);
560         res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
561                                                       mask2, mask3, const20,
562                                                       const6, const3);
563         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565         res0 = __msa_ave_u_b(inp0, res0);
566         res1 = __msa_ave_u_b(inp2, res1);
567         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
568         dst += (4 * dst_stride);
569     }
570 }
571 
horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)572 static void horiz_mc_qpel_no_rnd_aver_src0_16width_msa(const uint8_t *src,
573                                                        int32_t src_stride,
574                                                        uint8_t *dst,
575                                                        int32_t dst_stride,
576                                                        int32_t height)
577 {
578     uint8_t loop_count;
579     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
580     v16u8 res;
581     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
582     v8u16 const20 = (v8u16) __msa_ldi_h(20);
583     v16u8 const6 = (v16u8) __msa_ldi_b(6);
584     v16u8 const3 = (v16u8) __msa_ldi_b(3);
585 
586     for (loop_count = (height >> 2); loop_count--;) {
587         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589         src += (4 * src_stride);
590         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
591                                                const20, const6, const3);
592         res = __msa_ave_u_b(inp0, res);
593         ST_UB(res, dst);
594         dst += dst_stride;
595 
596         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
597                                                const20, const6, const3);
598         res = __msa_ave_u_b(inp2, res);
599         ST_UB(res, dst);
600         dst += dst_stride;
601 
602         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
603                                                const20, const6, const3);
604         res = __msa_ave_u_b(inp4, res);
605         ST_UB(res, dst);
606         dst += dst_stride;
607 
608         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
609                                                const20, const6, const3);
610         res = __msa_ave_u_b(inp6, res);
611         ST_UB(res, dst);
612         dst += dst_stride;
613     }
614 }
615 
horiz_mc_qpel_no_rnd_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)616 static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src,
617                                             int32_t src_stride,
618                                             uint8_t *dst,
619                                             int32_t dst_stride,
620                                             int32_t height)
621 {
622     uint8_t loop_count;
623     v16u8 inp0, inp1, inp2, inp3;
624     v16u8 res0, res1;
625     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
629     v16u8 const20 = (v16u8) __msa_ldi_b(20);
630     v16u8 const6 = (v16u8) __msa_ldi_b(6);
631     v16u8 const3 = (v16u8) __msa_ldi_b(3);
632 
633     for (loop_count = (height >> 2); loop_count--;) {
634         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635         src += (4 * src_stride);
636         res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
637                                                       mask2, mask3, const20,
638                                                       const6, const3);
639         res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
640                                                       mask2, mask3, const20,
641                                                       const6, const3);
642         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
643         dst += (4 * dst_stride);
644     }
645 }
646 
horiz_mc_qpel_no_rnd_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)647 static void horiz_mc_qpel_no_rnd_16width_msa(const uint8_t *src,
648                                              int32_t src_stride,
649                                              uint8_t *dst,
650                                              int32_t dst_stride,
651                                              int32_t height)
652 {
653     uint8_t loop_count;
654     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
655     v16u8 res;
656     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657     v16u8 const6 = (v16u8) __msa_ldi_b(6);
658     v16u8 const3 = (v16u8) __msa_ldi_b(3);
659     v8u16 const20 = (v8u16) __msa_ldi_h(20);
660 
661     for (loop_count = (height >> 2); loop_count--;) {
662         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664         src += (4 * src_stride);
665         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
666                                                const20, const6, const3);
667         ST_UB(res, dst);
668         dst += dst_stride;
669 
670         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
671                                                const20, const6, const3);
672         ST_UB(res, dst);
673         dst += dst_stride;
674 
675         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
676                                                const20, const6, const3);
677         ST_UB(res, dst);
678         dst += dst_stride;
679 
680         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
681                                                const20, const6, const3);
682         ST_UB(res, dst);
683         dst += dst_stride;
684     }
685 }
686 
horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)687 static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src,
688                                                       int32_t src_stride,
689                                                       uint8_t *dst,
690                                                       int32_t dst_stride,
691                                                       int32_t height)
692 {
693     uint8_t loop_count;
694     v16u8 inp0, inp1, inp2, inp3;
695     v16u8 res0, res1;
696     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
700     v16u8 const20 = (v16u8) __msa_ldi_b(20);
701     v16u8 const6 = (v16u8) __msa_ldi_b(6);
702     v16u8 const3 = (v16u8) __msa_ldi_b(3);
703 
704     for (loop_count = (height >> 2); loop_count--;) {
705         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706         src += (4 * src_stride);
707         res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
708                                                       mask2, mask3, const20,
709                                                       const6, const3);
710         res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
711                                                       mask2, mask3, const20,
712                                                       const6, const3);
713         SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
714                    inp0, inp1, inp2, inp3);
715         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717         res0 = __msa_ave_u_b(inp0, res0);
718         res1 = __msa_ave_u_b(inp2, res1);
719         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
720         dst += (4 * dst_stride);
721     }
722 }
723 
horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)724 static void horiz_mc_qpel_no_rnd_aver_src1_16width_msa(const uint8_t *src,
725                                                        int32_t src_stride,
726                                                        uint8_t *dst,
727                                                        int32_t dst_stride,
728                                                        int32_t height)
729 {
730     uint8_t loop_count;
731     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
732     v16u8 res;
733     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734     v16u8 const6 = (v16u8) __msa_ldi_b(6);
735     v16u8 const3 = (v16u8) __msa_ldi_b(3);
736     v8u16 const20 = (v8u16) __msa_ldi_h(20);
737 
738     for (loop_count = (height >> 2); loop_count--;) {
739         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741         src += (4 * src_stride);
742         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
743                                                const20, const6, const3);
744         res = __msa_ave_u_b(res, inp1);
745         ST_UB(res, dst);
746         dst += dst_stride;
747 
748         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
749                                                const20, const6, const3);
750         res = __msa_ave_u_b(res, inp3);
751         ST_UB(res, dst);
752         dst += dst_stride;
753 
754         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
755                                                const20, const6, const3);
756         res = __msa_ave_u_b(res, inp5);
757         ST_UB(res, dst);
758         dst += dst_stride;
759 
760         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
761                                                const20, const6, const3);
762         res = __msa_ave_u_b(res, inp7);
763         ST_UB(res, dst);
764         dst += dst_stride;
765     }
766 }
767 
horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)768 static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src,
769                                                        int32_t src_stride,
770                                                        uint8_t *dst,
771                                                        int32_t dst_stride,
772                                                        int32_t height)
773 {
774     uint8_t loop_count;
775     v16u8 inp0, inp1, inp2, inp3;
776     v16u8 dst0, dst1, dst2, dst3;
777     v16u8 res0, res1;
778     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
782     v16u8 const20 = (v16u8) __msa_ldi_b(20);
783     v16u8 const6 = (v16u8) __msa_ldi_b(6);
784     v16u8 const3 = (v16u8) __msa_ldi_b(3);
785 
786     for (loop_count = (height >> 2); loop_count--;) {
787         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788         src += (4 * src_stride);
789         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
790                                              mask0, mask1, mask2, mask3,
791                                              const20, const6, const3);
792         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
793                                              mask0, mask1, mask2, mask3,
794                                              const20, const6, const3);
795         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
796         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798         dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799         dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
800         AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
801         AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
802         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
803         dst += (4 * dst_stride);
804     }
805 }
806 
horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)807 static void horiz_mc_qpel_avg_dst_aver_src0_16width_msa(const uint8_t *src,
808                                                         int32_t src_stride,
809                                                         uint8_t *dst,
810                                                         int32_t dst_stride,
811                                                         int32_t height)
812 {
813     uint8_t loop_count;
814     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
815     v16u8 res0, res1;
816     v16u8 dst0, dst1;
817     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818     v16u8 const6 = (v16u8) __msa_ldi_b(6);
819     v16u8 const3 = (v16u8) __msa_ldi_b(3);
820     v8u16 const20 = (v8u16) __msa_ldi_h(20);
821 
822     for (loop_count = (height >> 2); loop_count--;) {
823         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825         src += (4 * src_stride);
826         res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
827                                        const20, const6, const3);
828         res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
829                                        const20, const6, const3);
830         LD_UB2(dst, dst_stride, dst0, dst1);
831         AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
832         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
833         ST_UB2(res0, res1, dst, dst_stride);
834         dst += (2 * dst_stride);
835 
836         res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
837                                        const20, const6, const3);
838         res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
839                                        const20, const6, const3);
840         LD_UB2(dst, dst_stride, dst0, dst1);
841         AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
842         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
843         ST_UB2(res0, res1, dst, dst_stride);
844         dst += (2 * dst_stride);
845     }
846 }
847 
horiz_mc_qpel_avg_dst_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)848 static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src,
849                                              int32_t src_stride,
850                                              uint8_t *dst,
851                                              int32_t dst_stride,
852                                              int32_t height)
853 {
854     uint8_t loop_count;
855     v16u8 inp0, inp1, inp2, inp3;
856     v16u8 dst0, dst1, dst2, dst3;
857     v16u8 res0, res1;
858     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
862     v16u8 const20 = (v16u8) __msa_ldi_b(20);
863     v16u8 const6 = (v16u8) __msa_ldi_b(6);
864     v16u8 const3 = (v16u8) __msa_ldi_b(3);
865 
866     for (loop_count = (height >> 2); loop_count--;) {
867         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868         src += (4 * src_stride);
869         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
870                                              mask0, mask1, mask2, mask3,
871                                              const20, const6, const3);
872         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
873                                              mask0, mask1, mask2, mask3,
874                                              const20, const6, const3);
875         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876         dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877         dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
878         AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
879         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
880         dst += (4 * dst_stride);
881     }
882 }
883 
horiz_mc_qpel_avg_dst_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)884 static void horiz_mc_qpel_avg_dst_16width_msa(const uint8_t *src,
885                                               int32_t src_stride,
886                                               uint8_t *dst,
887                                               int32_t dst_stride,
888                                               int32_t height)
889 {
890     uint8_t loop_count;
891     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
892     v16u8 res0, res1;
893     v16u8 dst0, dst1;
894     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895     v16u8 const6 = (v16u8) __msa_ldi_b(6);
896     v16u8 const3 = (v16u8) __msa_ldi_b(3);
897     v8u16 const20 = (v8u16) __msa_ldi_h(20);
898 
899     for (loop_count = (height >> 2); loop_count--;) {
900         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902         src += (4 * src_stride);
903         res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
904                                        const20, const6, const3);
905         res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
906                                        const20, const6, const3);
907         LD_UB2(dst, dst_stride, dst0, dst1);
908         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
909         ST_UB2(res0, res1, dst, dst_stride);
910         dst += (2 * dst_stride);
911 
912         res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
913                                        const20, const6, const3);
914         res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
915                                        const20, const6, const3);
916         LD_UB2(dst, dst_stride, dst0, dst1);
917         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
918         ST_UB2(res0, res1, dst, dst_stride);
919         dst += (2 * dst_stride);
920     }
921 }
922 
horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)923 static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src,
924                                                        int32_t src_stride,
925                                                        uint8_t *dst,
926                                                        int32_t dst_stride,
927                                                        int32_t height)
928 {
929     uint8_t loop_count;
930     v16u8 inp0, inp1, inp2, inp3;
931     v16u8 dst0, dst1, dst2, dst3;
932     v16u8 res0, res1;
933     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937     v16u8 const20 = (v16u8) __msa_ldi_b(20);
938     v16u8 const6 = (v16u8) __msa_ldi_b(6);
939     v16u8 const3 = (v16u8) __msa_ldi_b(3);
940 
941     for (loop_count = (height >> 2); loop_count--;) {
942         LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943         src += (4 * src_stride);
944         res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
945                                              mask0, mask1, mask2, mask3,
946                                              const20, const6, const3);
947         res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
948                                              mask0, mask1, mask2, mask3,
949                                              const20, const6, const3);
950         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
951         SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
952                    inp0, inp1, inp2, inp3);
953         inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954         inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955         dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956         dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
957         AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
958         AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
959         ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
960         dst += (4 * dst_stride);
961     }
962 }
963 
horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)964 static void horiz_mc_qpel_avg_dst_aver_src1_16width_msa(const uint8_t *src,
965                                                         int32_t src_stride,
966                                                         uint8_t *dst,
967                                                         int32_t dst_stride,
968                                                         int32_t height)
969 {
970     uint8_t loop_count;
971     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972     v16u8 res0, res1, dst0, dst1;
973     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974     v16u8 const6 = (v16u8) __msa_ldi_b(6);
975     v16u8 const3 = (v16u8) __msa_ldi_b(3);
976     v8u16 const20 = (v8u16) __msa_ldi_h(20);
977 
978     for (loop_count = (height >> 2); loop_count--;) {
979         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981         src += (4 * src_stride);
982         res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
983                                        const20, const6, const3);
984         res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
985                                        const20, const6, const3);
986         LD_UB2(dst, dst_stride, dst0, dst1);
987         AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
988         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
989         ST_UB2(res0, res1, dst, dst_stride);
990         dst += (2 * dst_stride);
991         res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
992                                        const20, const6, const3);
993         res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
994                                        const20, const6, const3);
995         LD_UB2(dst, dst_stride, dst0, dst1);
996         AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
997         AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
998         ST_UB2(res0, res1, dst, dst_stride);
999         dst += (2 * dst_stride);
1000     }
1001 }
1002 
1003 
vert_mc_qpel_aver_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1004 static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src,
1005                                            int32_t src_stride,
1006                                            uint8_t *dst,
1007                                            int32_t dst_stride)
1008 {
1009     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010     v16u8 tmp0, tmp1, res0, res1;
1011     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1014 
1015     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016     src += (4 * src_stride);
1017     LD_UB2(src, src_stride, inp4, inp5);
1018     src += (2 * src_stride);
1019     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1020                                         inp1, inp2, inp3, inp4,
1021                                         inp1, inp0, inp0, inp1,
1022                                         inp2, inp3, inp4, inp5,
1023                                         const20, const6, const3);
1024     LD_UB2(src, src_stride, inp6, inp7);
1025     src += (2 * src_stride);
1026     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1027                                         inp3, inp4, inp5, inp6,
1028                                         inp3, inp2, inp1, inp0,
1029                                         inp4, inp5, inp6, inp7,
1030                                         const20, const6, const3);
1031     tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032     tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1033     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1034     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1035 
1036     inp8 = LD_UB(src);
1037     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1038                                         inp5, inp6, inp7, inp8,
1039                                         inp5, inp4, inp3, inp2,
1040                                         inp6, inp7, inp8, inp8,
1041                                         const20, const6, const3);
1042     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1043                                         inp7, inp8, inp8, inp7,
1044                                         inp7, inp6, inp5, inp4,
1045                                         inp8, inp8, inp7, inp6,
1046                                         const20, const6, const3);
1047     tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1048     tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1049     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1050     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1051 }
1052 
vert_mc_qpel_aver_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1053 static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
1054                                              int32_t src_stride,
1055                                              uint8_t *dst,
1056                                              int32_t dst_stride)
1057 {
1058     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1059     v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1060     v16u8 res0;
1061     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1062     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1063     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1064 
1065     LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1066     src += (5 * src_stride);
1067     res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1068                                   inp1, inp2, inp3, inp4,
1069                                   const20, const6, const3);
1070     res0 = __msa_aver_u_b(res0, inp0);
1071     ST_UB(res0, dst);
1072     dst += dst_stride;
1073 
1074     inp5 = LD_UB(src);
1075     src += src_stride;
1076     res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1077                                   inp2, inp3, inp4, inp5,
1078                                   const20, const6, const3);
1079     res0 = __msa_aver_u_b(res0, inp1);
1080     ST_UB(res0, dst);
1081     dst += dst_stride;
1082 
1083     inp6 = LD_UB(src);
1084     src += src_stride;
1085     res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1086                                   inp3, inp4, inp5, inp6,
1087                                   const20, const6, const3);
1088     res0 = __msa_aver_u_b(res0, inp2);
1089     ST_UB(res0, dst);
1090     dst += dst_stride;
1091 
1092     inp7 = LD_UB(src);
1093     src += src_stride;
1094     res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1095                                   inp4, inp5, inp6, inp7,
1096                                   const20, const6, const3);
1097     res0 = __msa_aver_u_b(res0, inp3);
1098     ST_UB(res0, dst);
1099     dst += dst_stride;
1100 
1101     LD_UB2(src, src_stride, inp8, inp9);
1102     src += (2 * src_stride);
1103     res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1104                                   inp5, inp6, inp7, inp8,
1105                                   const20, const6, const3);
1106     res0 = __msa_aver_u_b(res0, inp4);
1107     ST_UB(res0, dst);
1108     dst += dst_stride;
1109 
1110     res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1111                                   inp6, inp7, inp8, inp9,
1112                                   const20, const6, const3);
1113     res0 = __msa_aver_u_b(res0, inp5);
1114     ST_UB(res0, dst);
1115     dst += dst_stride;
1116 
1117     LD_UB2(src, src_stride, inp10, inp11);
1118     src += (2 * src_stride);
1119     res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1120                                   inp7, inp8, inp9, inp10,
1121                                   const20, const6, const3);
1122     res0 = __msa_aver_u_b(res0, inp6);
1123     ST_UB(res0, dst);
1124     dst += dst_stride;
1125 
1126     res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1127                                   inp8, inp9, inp10, inp11,
1128                                   const20, const6, const3);
1129     res0 = __msa_aver_u_b(res0, inp7);
1130     ST_UB(res0, dst);
1131     dst += dst_stride;
1132 
1133     LD_UB2(src, src_stride, inp12, inp13);
1134     src += (2 * src_stride);
1135     res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1136                                   inp9, inp10, inp11, inp12,
1137                                   const20, const6, const3);
1138     res0 = __msa_aver_u_b(res0, inp8);
1139     ST_UB(res0, dst);
1140     dst += dst_stride;
1141 
1142     res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1143                                   inp10, inp11, inp12, inp13,
1144                                   const20, const6, const3);
1145     res0 = __msa_aver_u_b(res0, inp9);
1146     ST_UB(res0, dst);
1147     dst += dst_stride;
1148 
1149     LD_UB2(src, src_stride, inp14, inp15);
1150     src += (2 * src_stride);
1151     res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1152                                   inp11, inp12, inp13, inp14,
1153                                   const20, const6, const3);
1154     res0 = __msa_aver_u_b(res0, inp10);
1155     ST_UB(res0, dst);
1156     dst += dst_stride;
1157 
1158     res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1159                                   inp12, inp13, inp14, inp15,
1160                                   const20, const6, const3);
1161     res0 = __msa_aver_u_b(res0, inp11);
1162     ST_UB(res0, dst);
1163     dst += dst_stride;
1164 
1165     inp16 = LD_UB(src);
1166     res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1167                                   inp13, inp14, inp15, inp16,
1168                                   const20, const6, const3);
1169     res0 = __msa_aver_u_b(res0, inp12);
1170     ST_UB(res0, dst);
1171     dst += dst_stride;
1172 
1173     res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1174                                   inp14, inp15, inp16, inp16,
1175                                   const20, const6, const3);
1176     res0 = __msa_aver_u_b(res0, inp13);
1177     ST_UB(res0, dst);
1178     dst += dst_stride;
1179 
1180     res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1181                                   inp15, inp16, inp16, inp15,
1182                                   const20, const6, const3);
1183     res0 = __msa_aver_u_b(res0, inp14);
1184     ST_UB(res0, dst);
1185     dst += dst_stride;
1186 
1187     res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1188                                   inp16, inp16, inp15, inp14,
1189                                   const20, const6, const3);
1190     res0 = __msa_aver_u_b(res0, inp15);
1191     ST_UB(res0, dst);
1192 }
1193 
vert_mc_qpel_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1194 static void vert_mc_qpel_8x8_msa(const uint8_t *src,
1195                                  int32_t src_stride,
1196                                  uint8_t *dst,
1197                                  int32_t dst_stride)
1198 {
1199     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1200     v16u8 res0, res1;
1201     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1202     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1203     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1204 
1205     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1206     src += (4 * src_stride);
1207     LD_UB2(src, src_stride, inp4, inp5);
1208     src += (2 * src_stride);
1209     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1210                                         inp1, inp2, inp3, inp4,
1211                                         inp1, inp0, inp0, inp1,
1212                                         inp2, inp3, inp4, inp5,
1213                                         const20, const6, const3);
1214     LD_UB2(src, src_stride, inp6, inp7);
1215     src += (2 * src_stride);
1216     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1217                                         inp3, inp4, inp5, inp6,
1218                                         inp3, inp2, inp1, inp0,
1219                                         inp4, inp5, inp6, inp7,
1220                                         const20, const6, const3);
1221     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1222 
1223     inp8 = LD_UB(src);
1224     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1225                                         inp5, inp6, inp7, inp8,
1226                                         inp5, inp4, inp3, inp2,
1227                                         inp6, inp7, inp8, inp8,
1228                                         const20, const6, const3);
1229     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1230                                         inp7, inp8, inp8, inp7,
1231                                         inp7, inp6, inp5, inp4,
1232                                         inp8, inp8, inp7, inp6,
1233                                         const20, const6, const3);
1234     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1235 }
1236 
vert_mc_qpel_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1237 static void vert_mc_qpel_16x16_msa(const uint8_t *src,
1238                                    int32_t src_stride,
1239                                    uint8_t *dst,
1240                                    int32_t dst_stride)
1241 {
1242     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1243     v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1244     v16u8 res0;
1245     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1246     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1247     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1248 
1249     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1250     src += (4 * src_stride);
1251     inp4 = LD_UB(src);
1252     src += src_stride;
1253     res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1254                                   inp1, inp2, inp3, inp4,
1255                                   const20, const6, const3);
1256     ST_UB(res0, dst);
1257     dst += dst_stride;
1258 
1259     inp5 = LD_UB(src);
1260     src += src_stride;
1261     res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1262                                   inp2, inp3, inp4, inp5,
1263                                   const20, const6, const3);
1264     ST_UB(res0, dst);
1265     dst += dst_stride;
1266 
1267     inp6 = LD_UB(src);
1268     src += src_stride;
1269     res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1270                                   inp3, inp4, inp5, inp6,
1271                                   const20, const6, const3);
1272     ST_UB(res0, dst);
1273     dst += dst_stride;
1274 
1275     inp7 = LD_UB(src);
1276     src += src_stride;
1277     res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1278                                   inp4, inp5, inp6, inp7,
1279                                   const20, const6, const3);
1280     ST_UB(res0, dst);
1281     dst += dst_stride;
1282 
1283     inp8 = LD_UB(src);
1284     src += src_stride;
1285     res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1286                                   inp5, inp6, inp7, inp8,
1287                                   const20, const6, const3);
1288     ST_UB(res0, dst);
1289     dst += dst_stride;
1290 
1291     inp9 = LD_UB(src);
1292     src += src_stride;
1293     res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1294                                   inp6, inp7, inp8, inp9,
1295                                   const20, const6, const3);
1296     ST_UB(res0, dst);
1297     dst += dst_stride;
1298 
1299     inp10 = LD_UB(src);
1300     src += src_stride;
1301     res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1302                                   inp7, inp8, inp9, inp10,
1303                                   const20, const6, const3);
1304     ST_UB(res0, dst);
1305     dst += dst_stride;
1306 
1307     inp11 = LD_UB(src);
1308     src += src_stride;
1309     res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1310                                   inp8, inp9, inp10, inp11,
1311                                   const20, const6, const3);
1312     ST_UB(res0, dst);
1313     dst += dst_stride;
1314 
1315     inp12 = LD_UB(src);
1316     src += src_stride;
1317     res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1318                                   inp9, inp10, inp11, inp12,
1319                                   const20, const6, const3);
1320     ST_UB(res0, dst);
1321     dst += dst_stride;
1322 
1323     inp13 = LD_UB(src);
1324     src += src_stride;
1325     res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1326                                   inp10, inp11, inp12, inp13,
1327                                   const20, const6, const3);
1328     ST_UB(res0, dst);
1329     dst += dst_stride;
1330 
1331     inp14 = LD_UB(src);
1332     src += src_stride;
1333     res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1334                                   inp11, inp12, inp13, inp14,
1335                                   const20, const6, const3);
1336     ST_UB(res0, dst);
1337     dst += dst_stride;
1338 
1339     inp15 = LD_UB(src);
1340     src += src_stride;
1341     res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1342                                   inp12, inp13, inp14, inp15,
1343                                   const20, const6, const3);
1344     ST_UB(res0, dst);
1345     dst += dst_stride;
1346 
1347     inp16 = LD_UB(src);
1348     res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1349                                   inp13, inp14, inp15, inp16,
1350                                   const20, const6, const3);
1351     ST_UB(res0, dst);
1352     dst += dst_stride;
1353 
1354     res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1355                                   inp14, inp15, inp16, inp16,
1356                                   const20, const6, const3);
1357     ST_UB(res0, dst);
1358     dst += dst_stride;
1359 
1360     res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1361                                   inp15, inp16, inp16, inp15,
1362                                   const20, const6, const3);
1363     ST_UB(res0, dst);
1364     dst += dst_stride;
1365 
1366     res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1367                                   inp16, inp16, inp15, inp14,
1368                                   const20, const6, const3);
1369     ST_UB(res0, dst);
1370     dst += dst_stride;
1371 }
1372 
vert_mc_qpel_aver_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1373 static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src,
1374                                            int32_t src_stride,
1375                                            uint8_t *dst,
1376                                            int32_t dst_stride)
1377 {
1378     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1379     v16u8 tmp0, tmp1, res0, res1;
1380     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1381     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1382     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1383 
1384     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1385     src += (4 * src_stride);
1386     LD_UB2(src, src_stride, inp4, inp5);
1387     src += (2 * src_stride);
1388     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1389                                         inp1, inp2, inp3, inp4,
1390                                         inp1, inp0, inp0, inp1,
1391                                         inp2, inp3, inp4, inp5,
1392                                         const20, const6, const3);
1393 
1394     LD_UB2(src, src_stride, inp6, inp7);
1395     src += (2 * src_stride);
1396     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1397                                         inp3, inp4, inp5, inp6,
1398                                         inp3, inp2, inp1, inp0,
1399                                         inp4, inp5, inp6, inp7,
1400                                         const20, const6, const3);
1401     tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1402     tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1403     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1404     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1405 
1406     inp8 = LD_UB(src);
1407     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1408                                         inp5, inp6, inp7, inp8,
1409                                         inp5, inp4, inp3, inp2,
1410                                         inp6, inp7, inp8, inp8,
1411                                         const20, const6, const3);
1412     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1413                                         inp7, inp8, inp8, inp7,
1414                                         inp7, inp6, inp5, inp4,
1415                                         inp8, inp8, inp7, inp6,
1416                                         const20, const6, const3);
1417     tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1418     tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1419     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1420     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1421 }
1422 
vert_mc_qpel_aver_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1423 static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
1424                                              int32_t src_stride,
1425                                              uint8_t *dst,
1426                                              int32_t dst_stride)
1427 {
1428     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1429     v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1430     v16u8 res0;
1431     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1432     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1433     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1434 
1435     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1436     src += (4 * src_stride);
1437     inp4 = LD_UB(src);
1438     src += src_stride;
1439     res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1440                                   inp1, inp2, inp3, inp4,
1441                                   const20, const6, const3);
1442     res0 = __msa_aver_u_b(res0, inp1);
1443     ST_UB(res0, dst);
1444     dst += dst_stride;
1445 
1446     inp5 = LD_UB(src);
1447     src += src_stride;
1448     res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1449                                   inp2, inp3, inp4, inp5,
1450                                   const20, const6, const3);
1451     res0 = __msa_aver_u_b(res0, inp2);
1452     ST_UB(res0, dst);
1453     dst += dst_stride;
1454 
1455     inp6 = LD_UB(src);
1456     src += src_stride;
1457     res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1458                                   inp3, inp4, inp5, inp6,
1459                                   const20, const6, const3);
1460     res0 = __msa_aver_u_b(res0, inp3);
1461     ST_UB(res0, dst);
1462     dst += dst_stride;
1463 
1464     inp7 = LD_UB(src);
1465     src += src_stride;
1466     res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1467                                   inp4, inp5, inp6, inp7,
1468                                   const20, const6, const3);
1469     res0 = __msa_aver_u_b(res0, inp4);
1470     ST_UB(res0, dst);
1471     dst += dst_stride;
1472 
1473     inp8 = LD_UB(src);
1474     src += src_stride;
1475     res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1476                                   inp5, inp6, inp7, inp8,
1477                                   const20, const6, const3);
1478     res0 = __msa_aver_u_b(res0, inp5);
1479     ST_UB(res0, dst);
1480     dst += dst_stride;
1481 
1482     inp9 = LD_UB(src);
1483     src += src_stride;
1484     res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1485                                   inp6, inp7, inp8, inp9,
1486                                   const20, const6, const3);
1487     res0 = __msa_aver_u_b(res0, inp6);
1488     ST_UB(res0, dst);
1489     dst += dst_stride;
1490 
1491     inp10 = LD_UB(src);
1492     src += src_stride;
1493     res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1494                                   inp7, inp8, inp9, inp10,
1495                                   const20, const6, const3);
1496     res0 = __msa_aver_u_b(res0, inp7);
1497     ST_UB(res0, dst);
1498     dst += dst_stride;
1499 
1500     inp11 = LD_UB(src);
1501     src += src_stride;
1502     res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1503                                   inp8, inp9, inp10, inp11,
1504                                   const20, const6, const3);
1505     res0 = __msa_aver_u_b(res0, inp8);
1506     ST_UB(res0, dst);
1507     dst += dst_stride;
1508 
1509     inp12 = LD_UB(src);
1510     src += src_stride;
1511     res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1512                                   inp9, inp10, inp11, inp12,
1513                                   const20, const6, const3);
1514     res0 = __msa_aver_u_b(res0, inp9);
1515     ST_UB(res0, dst);
1516     dst += dst_stride;
1517 
1518     inp13 = LD_UB(src);
1519     src += src_stride;
1520     res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1521                                   inp10, inp11, inp12, inp13,
1522                                   const20, const6, const3);
1523     res0 = __msa_aver_u_b(res0, inp10);
1524     ST_UB(res0, dst);
1525     dst += dst_stride;
1526 
1527     inp14 = LD_UB(src);
1528     src += src_stride;
1529     res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1530                                   inp11, inp12, inp13, inp14,
1531                                   const20, const6, const3);
1532     res0 = __msa_aver_u_b(res0, inp11);
1533     ST_UB(res0, dst);
1534     dst += dst_stride;
1535 
1536     inp15 = LD_UB(src);
1537     src += src_stride;
1538     res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1539                                   inp12, inp13, inp14, inp15,
1540                                   const20, const6, const3);
1541     res0 = __msa_aver_u_b(res0, inp12);
1542     ST_UB(res0, dst);
1543     dst += dst_stride;
1544 
1545     inp16 = LD_UB(src);
1546     res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1547                                   inp13, inp14, inp15, inp16,
1548                                   const20, const6, const3);
1549     res0 = __msa_aver_u_b(res0, inp13);
1550     ST_UB(res0, dst);
1551     dst += dst_stride;
1552 
1553     res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1554                                   inp14, inp15, inp16, inp16,
1555                                   const20, const6, const3);
1556     res0 = __msa_aver_u_b(res0, inp14);
1557     ST_UB(res0, dst);
1558     dst += dst_stride;
1559 
1560     res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1561                                   inp15, inp16, inp16, inp15,
1562                                   const20, const6, const3);
1563     res0 = __msa_aver_u_b(res0, inp15);
1564     ST_UB(res0, dst);
1565     dst += dst_stride;
1566 
1567     res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1568                                   inp16, inp16, inp15, inp14,
1569                                   const20, const6, const3);
1570     res0 = __msa_aver_u_b(res0, inp16);
1571     ST_UB(res0, dst);
1572 }
1573 
vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1574 static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src,
1575                                                   int32_t src_stride,
1576                                                   uint8_t *dst,
1577                                                   int32_t dst_stride)
1578 {
1579     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1580     v16u8 tmp0, tmp1, res0, res1;
1581     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1582     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1583     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1584 
1585     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1586     src += (4 * src_stride);
1587     LD_UB2(src, src_stride, inp4, inp5);
1588     src += (2 * src_stride);
1589     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1590                                                  inp1, inp2, inp3, inp4,
1591                                                  inp1, inp0, inp0, inp1,
1592                                                  inp2, inp3, inp4, inp5,
1593                                                  const20, const6, const3);
1594     LD_UB2(src, src_stride, inp6, inp7);
1595     src += (2 * src_stride);
1596     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1597                                                  inp3, inp4, inp5, inp6,
1598                                                  inp3, inp2, inp1, inp0,
1599                                                  inp4, inp5, inp6, inp7,
1600                                                  const20, const6, const3);
1601     tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1602     tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1603     res0 = __msa_ave_u_b(res0, tmp0);
1604     res1 = __msa_ave_u_b(res1, tmp1);
1605     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1606 
1607     inp8 = LD_UB(src);
1608     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1609                                                  inp5, inp6, inp7, inp8,
1610                                                  inp5, inp4, inp3, inp2,
1611                                                  inp6, inp7, inp8, inp8,
1612                                                  const20, const6, const3);
1613     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1614                                                  inp7, inp8, inp8, inp7,
1615                                                  inp7, inp6, inp5, inp4,
1616                                                  inp8, inp8, inp7, inp6,
1617                                                  const20, const6, const3);
1618     tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1619     tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1620     res0 = __msa_ave_u_b(res0, tmp0);
1621     res1 = __msa_ave_u_b(res1, tmp1);
1622     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1623 }
1624 
vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1625 static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
1626                                                     int32_t src_stride,
1627                                                     uint8_t *dst,
1628                                                     int32_t dst_stride)
1629 {
1630     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1631     v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1632     v16u8 res0;
1633     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1634     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1635     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1636 
1637     LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1638     src += (5 * src_stride);
1639     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1640                                            inp1, inp2, inp3, inp4,
1641                                            const20, const6, const3);
1642     res0 = __msa_ave_u_b(res0, inp0);
1643     ST_UB(res0, dst);
1644     dst += dst_stride;
1645 
1646     inp5 = LD_UB(src);
1647     src += src_stride;
1648     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1649                                            inp2, inp3, inp4, inp5,
1650                                            const20, const6, const3);
1651     res0 = __msa_ave_u_b(res0, inp1);
1652     ST_UB(res0, dst);
1653     dst += dst_stride;
1654 
1655     inp6 = LD_UB(src);
1656     src += src_stride;
1657     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1658                                            inp3, inp4, inp5, inp6,
1659                                            const20, const6, const3);
1660     res0 = __msa_ave_u_b(res0, inp2);
1661     ST_UB(res0, dst);
1662     dst += dst_stride;
1663 
1664     inp7 = LD_UB(src);
1665     src += src_stride;
1666     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1667                                            inp4, inp5, inp6, inp7,
1668                                            const20, const6, const3);
1669     res0 = __msa_ave_u_b(res0, inp3);
1670     ST_UB(res0, dst);
1671     dst += dst_stride;
1672 
1673     inp8 = LD_UB(src);
1674     src += src_stride;
1675     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1676                                            inp5, inp6, inp7, inp8,
1677                                            const20, const6, const3);
1678     res0 = __msa_ave_u_b(res0, inp4);
1679     ST_UB(res0, dst);
1680     dst += dst_stride;
1681 
1682     inp9 = LD_UB(src);
1683     src += src_stride;
1684     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1685                                            inp6, inp7, inp8, inp9,
1686                                            const20, const6, const3);
1687     res0 = __msa_ave_u_b(res0, inp5);
1688     ST_UB(res0, dst);
1689     dst += dst_stride;
1690 
1691     inp10 = LD_UB(src);
1692     src += src_stride;
1693     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1694                                            inp7, inp8, inp9, inp10,
1695                                            const20, const6, const3);
1696     res0 = __msa_ave_u_b(res0, inp6);
1697     ST_UB(res0, dst);
1698     dst += dst_stride;
1699 
1700     inp11 = LD_UB(src);
1701     src += src_stride;
1702     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1703                                            inp8, inp9, inp10, inp11,
1704                                            const20, const6, const3);
1705     res0 = __msa_ave_u_b(res0, inp7);
1706     ST_UB(res0, dst);
1707     dst += dst_stride;
1708 
1709     inp12 = LD_UB(src);
1710     src += src_stride;
1711     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1712                                            inp9, inp10, inp11, inp12,
1713                                            const20, const6, const3);
1714     res0 = __msa_ave_u_b(res0, inp8);
1715     ST_UB(res0, dst);
1716     dst += dst_stride;
1717 
1718     inp13 = LD_UB(src);
1719     src += src_stride;
1720     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1721                                            inp10, inp11, inp12, inp13,
1722                                            const20, const6, const3);
1723     res0 = __msa_ave_u_b(res0, inp9);
1724     ST_UB(res0, dst);
1725     dst += dst_stride;
1726 
1727     inp14 = LD_UB(src);
1728     src += src_stride;
1729     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1730                                            inp11, inp12, inp13, inp14,
1731                                            const20, const6, const3);
1732     res0 = __msa_ave_u_b(res0, inp10);
1733     ST_UB(res0, dst);
1734     dst += dst_stride;
1735 
1736     inp15 = LD_UB(src);
1737     src += src_stride;
1738     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1739                                            inp12, inp13, inp14, inp15,
1740                                            const20, const6, const3);
1741     res0 = __msa_ave_u_b(res0, inp11);
1742     ST_UB(res0, dst);
1743     dst += dst_stride;
1744 
1745     inp16 = LD_UB(src);
1746     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1747                                            inp13, inp14, inp15, inp16,
1748                                            const20, const6, const3);
1749     res0 = __msa_ave_u_b(res0, inp12);
1750     ST_UB(res0, dst);
1751     dst += dst_stride;
1752 
1753     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1754                                            inp14, inp15, inp16, inp16,
1755                                            const20, const6, const3);
1756     res0 = __msa_ave_u_b(res0, inp13);
1757     ST_UB(res0, dst);
1758     dst += dst_stride;
1759 
1760     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1761                                            inp15, inp16, inp16, inp15,
1762                                            const20, const6, const3);
1763     res0 = __msa_ave_u_b(res0, inp14);
1764     ST_UB(res0, dst);
1765     dst += dst_stride;
1766 
1767     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1768                                            inp16, inp16, inp15, inp14,
1769                                            const20, const6, const3);
1770     res0 = __msa_ave_u_b(res0, inp15);
1771     ST_UB(res0, dst);
1772     dst += dst_stride;
1773 }
1774 
vert_mc_qpel_no_rnd_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1775 static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
1776                                         int32_t src_stride,
1777                                         uint8_t *dst,
1778                                         int32_t dst_stride)
1779 {
1780     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1781     v16u8 res0, res1;
1782     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1783     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1784     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1785 
1786     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1787     src += (4 * src_stride);
1788     LD_UB2(src, src_stride, inp4, inp5);
1789     src += (2 * src_stride);
1790     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1791                                                  inp1, inp2, inp3, inp4,
1792                                                  inp1, inp0, inp0, inp1,
1793                                                  inp2, inp3, inp4, inp5,
1794                                                  const20, const6, const3);
1795     LD_UB2(src, src_stride, inp6, inp7);
1796     src += (2 * src_stride);
1797     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1798                                                  inp3, inp4, inp5, inp6,
1799                                                  inp3, inp2, inp1, inp0,
1800                                                  inp4, inp5, inp6, inp7,
1801                                                  const20, const6, const3);
1802     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1803 
1804     inp8 = LD_UB(src);
1805     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1806                                                  inp5, inp6, inp7, inp8,
1807                                                  inp5, inp4, inp3, inp2,
1808                                                  inp6, inp7, inp8, inp8,
1809                                                  const20, const6, const3);
1810     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1811                                                  inp7, inp8, inp8, inp7,
1812                                                  inp7, inp6, inp5, inp4,
1813                                                  inp8, inp8, inp7, inp6,
1814                                                  const20, const6, const3);
1815     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1816 }
1817 
/*
 * Vertical qpel filter of a 16-pixel-wide, 16-row block, built on the
 * NO_ROUND variant of the filter macro.
 *
 * Seventeen source rows (inp0..inp16, one 16-byte vector each) are read
 * starting at src. Each output row comes from one
 * APPLY_VERT_QPEL_NO_ROUND_FILTER invocation and is written to dst with
 * ST_UB. After the initial five rows, every further input row is loaded
 * right before the first output row whose argument list contains it.
 *
 * src, src_stride: first input row and distance between input rows.
 * dst, dst_stride: first output row and distance between output rows.
 *
 * NOTE(review): the filter macro is defined elsewhere in this file; the
 * 20 / 6 / 3 constants are presumably the magnitudes of its tap weights -
 * confirm against the macro.
 */
static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Output row 0. Rows 0..4 are all it needs; inp0..inp2 are reused in
     * the first argument group, presumably standing in for rows above the
     * block. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 1 (still reuses inp0 / inp1 at the top edge). */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 2 (inp0 appears twice at the top edge). */
    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 3: first row whose eight inputs are all distinct. */
    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 4. */
    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 5. */
    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 6. */
    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 7. */
    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 8. */
    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 9. */
    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 10. */
    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 11. */
    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 12: inp16 is the last row read; src is not advanced
     * afterwards. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 13 (inp16 is reused in the second argument group,
     * presumably standing in for rows below the block). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 14 (reuses inp16 and inp15 at the bottom edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
                                           inp15, inp16, inp16, inp15,
                                           const20, const6, const3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 15 (reuses inp16, inp15 and inp14 at the bottom edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
                                           inp16, inp16, inp15, inp14,
                                           const20, const6, const3);
    ST_UB(res0, dst);
}
1950 
vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)1951 static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src,
1952                                                   int32_t src_stride,
1953                                                   uint8_t *dst,
1954                                                   int32_t dst_stride)
1955 {
1956     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1957     v16u8 tmp0, tmp1, res0, res1;
1958     v16u8 const20 = (v16u8) __msa_ldi_b(20);
1959     v16u8 const6 = (v16u8) __msa_ldi_b(6);
1960     v16u8 const3 = (v16u8) __msa_ldi_b(3);
1961 
1962     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1963     src += (4 * src_stride);
1964     LD_UB2(src, src_stride, inp4, inp5);
1965     src += (2 * src_stride);
1966     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1967                                                  inp1, inp2, inp3, inp4,
1968                                                  inp1, inp0, inp0, inp1,
1969                                                  inp2, inp3, inp4, inp5,
1970                                                  const20, const6, const3);
1971     LD_UB2(src, src_stride, inp6, inp7);
1972     src += (2 * src_stride);
1973     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1974                                                  inp3, inp4, inp5, inp6,
1975                                                  inp3, inp2, inp1, inp0,
1976                                                  inp4, inp5, inp6, inp7,
1977                                                  const20, const6, const3);
1978     tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1979     tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1980     res0 = __msa_ave_u_b(res0, tmp0);
1981     res1 = __msa_ave_u_b(res1, tmp1);
1982     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1983 
1984     inp8 = LD_UB(src);
1985     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1986                                                  inp5, inp6, inp7, inp8,
1987                                                  inp5, inp4, inp3, inp2,
1988                                                  inp6, inp7, inp8, inp8,
1989                                                  const20, const6, const3);
1990     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1991                                                  inp7, inp8, inp8, inp7,
1992                                                  inp7, inp6, inp5, inp4,
1993                                                  inp8, inp8, inp7, inp6,
1994                                                  const20, const6, const3);
1995     tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1996     tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1997     res0 = __msa_ave_u_b(res0, tmp0);
1998     res1 = __msa_ave_u_b(res1, tmp1);
1999     ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2000 }
2001 
/*
 * Vertical qpel filter of a 16-pixel-wide, 16-row block (NO_ROUND filter
 * macro), followed by an average with the source row one below each
 * output row.
 *
 * Seventeen source rows (inp0..inp16, one 16-byte vector each) are read
 * starting at src. For output row k the filter result is combined with
 * source row k + 1 through __msa_ave_u_b (the non-rounding unsigned byte
 * average) and then written to dst with ST_UB.
 *
 * src, src_stride: first input row and distance between input rows.
 * dst, dst_stride: first output row and distance between output rows.
 *
 * NOTE(review): the filter macro is defined elsewhere in this file; the
 * 20 / 6 / 3 constants are presumably the magnitudes of its tap weights -
 * confirm against the macro.
 */
static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Output row 0, averaged with source row 1. inp0..inp2 are reused in
     * the first argument group, presumably standing in for rows above the
     * block. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
                                           inp1, inp2, inp3, inp4,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp1);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 1, averaged with source row 2. */
    inp5 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
                                           inp2, inp3, inp4, inp5,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp2);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 2, averaged with source row 3. */
    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
                                           inp3, inp4, inp5, inp6,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp3);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 3, averaged with source row 4; first row whose eight
     * filter inputs are all distinct. */
    inp7 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
                                           inp4, inp5, inp6, inp7,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp4);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 4, averaged with source row 5. */
    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
                                           inp5, inp6, inp7, inp8,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp5);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 5, averaged with source row 6. */
    inp9 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
                                           inp6, inp7, inp8, inp9,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp6);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 6, averaged with source row 7. */
    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
                                           inp7, inp8, inp9, inp10,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp7);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 7, averaged with source row 8. */
    inp11 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
                                           inp8, inp9, inp10, inp11,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp8);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 8, averaged with source row 9. */
    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
                                           inp9, inp10, inp11, inp12,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp9);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 9, averaged with source row 10. */
    inp13 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
                                           inp10, inp11, inp12, inp13,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp10);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 10, averaged with source row 11. */
    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
                                           inp11, inp12, inp13, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp11);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 11, averaged with source row 12. */
    inp15 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
                                           inp12, inp13, inp14, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp12);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 12, averaged with source row 13. inp16 is the last row
     * read; src is not advanced afterwards. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
                                           inp13, inp14, inp15, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp13);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 13, averaged with source row 14 (inp16 is reused in the
     * second argument group, presumably standing in for rows below the
     * block). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
                                           inp14, inp15, inp16, inp16,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp14);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 14, averaged with source row 15. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
                                           inp15, inp16, inp16, inp15,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp15);
    ST_UB(res0, dst);
    dst += dst_stride;

    /* Output row 15, averaged with source row 16. */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
                                           inp16, inp16, inp15, inp14,
                                           const20, const6, const3);
    res0 = __msa_ave_u_b(res0, inp16);
    ST_UB(res0, dst);
}
2150 
vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2151 static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src,
2152                                                    int32_t src_stride,
2153                                                    uint8_t *dst,
2154                                                    int32_t dst_stride)
2155 {
2156     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2157     v16u8 dst0, dst1, dst2, dst3;
2158     v16u8 tmp0, tmp1, res0, res1;
2159     v16u8 const20 = (v16u8) __msa_ldi_b(20);
2160     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2161     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2162 
2163     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2164     src += (4 * src_stride);
2165     LD_UB2(src, src_stride, inp4, inp5);
2166     src += (2 * src_stride);
2167     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2168                                         inp1, inp2, inp3, inp4,
2169                                         inp1, inp0, inp0, inp1,
2170                                         inp2, inp3, inp4, inp5,
2171                                         const20, const6, const3);
2172 
2173     LD_UB2(src, src_stride, inp6, inp7);
2174     src += (2 * src_stride);
2175     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2176                                         inp3, inp4, inp5, inp6,
2177                                         inp3, inp2, inp1, inp0,
2178                                         inp4, inp5, inp6, inp7,
2179                                         const20, const6, const3);
2180 
2181     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2182     tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2183     tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2184     dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2185     dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2186     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2187     AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2188     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2189     dst += (4 * dst_stride);
2190 
2191     inp8 = LD_UB(src);
2192     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2193                                         inp5, inp6, inp7, inp8,
2194                                         inp5, inp4, inp3, inp2,
2195                                         inp6, inp7, inp8, inp8,
2196                                         const20, const6, const3);
2197     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2198                                         inp7, inp8, inp8, inp7,
2199                                         inp7, inp6, inp5, inp4,
2200                                         inp8, inp8, inp7, inp6,
2201                                         const20, const6, const3);
2202 
2203     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2204     tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2205     tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2206     dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2207     dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2208     AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2209     AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2210     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2211 }
2212 
/*
 * Vertical qpel filter of a 16-pixel-wide, 16-row block, averaged first
 * with the co-located source row and then with what is already stored in
 * dst.
 *
 * Seventeen source rows (inp0..inp16, one 16-byte vector each) are read
 * starting at src. Output rows are produced two at a time: each
 * APPLY_VERT_QPEL_FILTER result for row k is combined with source row k and
 * then with destination row k through AVER_UB2_UB (presumably a rounding
 * byte average - defined in the MSA macro header), and the pair is written
 * back with ST_UB2.
 *
 * src, src_stride: first input row and distance between input rows.
 * dst, dst_stride: block that is both read and written, and its row pitch.
 *
 * NOTE(review): the filter macro is defined elsewhere in this file; the
 * 20 / 6 / 3 constants are presumably the magnitudes of its tap weights -
 * confirm against the macro.
 */
static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0, res1, dst0, dst1;
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Output row 0. inp0..inp2 are reused in the first argument group,
     * presumably standing in for rows above the block. */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);

    /* Output row 1. */
    inp5 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);

    /* Rows 0/1: average with source rows 0/1, then with the current
     * destination rows, and write back. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output row 2. */
    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);

    /* Output row 3: first row whose eight filter inputs are all
     * distinct. */
    inp7 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);

    /* Rows 2/3: average with source rows 2/3 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 4 and 5. */
    LD_UB2(src, src_stride, inp8, inp9);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);

    /* Rows 4/5: average with source rows 4/5 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6 and 7. */
    LD_UB2(src, src_stride, inp10, inp11);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);

    /* Rows 6/7: average with source rows 6/7 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 8 and 9. */
    LD_UB2(src, src_stride, inp12, inp13);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    /* Rows 8/9: average with source rows 8/9 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 10 and 11. */
    LD_UB2(src, src_stride, inp14, inp15);
    src += (2 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);

    /* Rows 10/11: average with source rows 10/11 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 12 and 13. inp16 is the last row read; row 13 already
     * reuses it in the second argument group, presumably standing in for
     * a row below the block. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    /* Rows 12/13: average with source rows 12/13 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 14 and 15 (bottom edge: inp16, inp15 and inp14 are
     * reused). */
    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    /* Rows 14/15: average with source rows 14/15 and with dst. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}
2344 
vert_mc_qpel_avg_dst_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2345 static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src,
2346                                          int32_t src_stride,
2347                                          uint8_t *dst,
2348                                          int32_t dst_stride)
2349 {
2350     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2351     v16u8 dst0, dst1, dst2, dst3;
2352     v16u8 res0, res1;
2353     v16u8 const20 = (v16u8) __msa_ldi_b(20);
2354     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2355     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2356 
2357     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2358     src += (4 * src_stride);
2359     LD_UB2(src, src_stride, inp4, inp5);
2360     src += (2 * src_stride);
2361     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2362                                         inp1, inp2, inp3, inp4,
2363                                         inp1, inp0, inp0, inp1,
2364                                         inp2, inp3, inp4, inp5,
2365                                         const20, const6, const3);
2366     LD_UB2(src, src_stride, inp6, inp7);
2367     src += (2 * src_stride);
2368     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2369                                         inp3, inp4, inp5, inp6,
2370                                         inp3, inp2, inp1, inp0,
2371                                         inp4, inp5, inp6, inp7,
2372                                         const20, const6, const3);
2373     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2374     dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2375     dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2376     AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2377     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2378     dst += (4 * dst_stride);
2379 
2380     inp8 = LD_UB(src);
2381     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2382                                         inp5, inp6, inp7, inp8,
2383                                         inp5, inp4, inp3, inp2,
2384                                         inp6, inp7, inp8, inp8,
2385                                         const20, const6, const3);
2386     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2387                                         inp7, inp8, inp8, inp7,
2388                                         inp7, inp6, inp5, inp4,
2389                                         inp8, inp8, inp7, inp6,
2390                                         const20, const6, const3);
2391     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2392     dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2393     dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2394     AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2395     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2396 }
2397 
vert_mc_qpel_avg_dst_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2398 static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
2399                                            int32_t src_stride,
2400                                            uint8_t *dst,
2401                                            int32_t dst_stride)
2402 {
2403     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2404     v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2405     v16u8 res0, res1, dst0, dst1;
2406     v16u8 const20 = (v16u8) __msa_ldi_b(20);
2407     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2408     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2409 
2410     LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2411     src += (5 * src_stride);
2412     res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2413                                   inp1, inp2, inp3, inp4,
2414                                   const20, const6, const3);
2415     inp5 = LD_UB(src);
2416     src += src_stride;
2417     res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2418                                   inp2, inp3, inp4, inp5,
2419                                   const20, const6, const3);
2420     LD_UB2(dst, dst_stride, dst0, dst1);
2421     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2422     ST_UB2(res0, res1, dst, dst_stride);
2423     dst += (2 * dst_stride);
2424 
2425     inp6 = LD_UB(src);
2426     src += src_stride;
2427     res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2428                                   inp3, inp4, inp5, inp6,
2429                                   const20, const6, const3);
2430     inp7 = LD_UB(src);
2431     src += src_stride;
2432     res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2433                                   inp4, inp5, inp6, inp7,
2434                                   const20, const6, const3);
2435     LD_UB2(dst, dst_stride, dst0, dst1);
2436     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2437     ST_UB2(res0, res1, dst, dst_stride);
2438     dst += (2 * dst_stride);
2439 
2440     inp8 = LD_UB(src);
2441     src += src_stride;
2442     res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2443                                   inp5, inp6, inp7, inp8,
2444                                   const20, const6, const3);
2445     inp9 = LD_UB(src);
2446     src += src_stride;
2447     res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2448                                   inp6, inp7, inp8, inp9,
2449                                   const20, const6, const3);
2450     LD_UB2(dst, dst_stride, dst0, dst1);
2451     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2452     ST_UB2(res0, res1, dst, dst_stride);
2453     dst += (2 * dst_stride);
2454 
2455     inp10 = LD_UB(src);
2456     src += src_stride;
2457     res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2458                                   inp7, inp8, inp9, inp10,
2459                                   const20, const6, const3);
2460     inp11 = LD_UB(src);
2461     src += src_stride;
2462     res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2463                                   inp8, inp9, inp10, inp11,
2464                                   const20, const6, const3);
2465     LD_UB2(dst, dst_stride, dst0, dst1);
2466     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2467     ST_UB2(res0, res1, dst, dst_stride);
2468     dst += (2 * dst_stride);
2469 
2470     inp12 = LD_UB(src);
2471     src += src_stride;
2472     res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2473                                   inp9, inp10, inp11, inp12,
2474                                   const20, const6, const3);
2475     inp13 = LD_UB(src);
2476     src += src_stride;
2477     res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2478                                   inp10, inp11, inp12, inp13,
2479                                   const20, const6, const3);
2480     LD_UB2(dst, dst_stride, dst0, dst1);
2481     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2482     ST_UB2(res0, res1, dst, dst_stride);
2483     dst += (2 * dst_stride);
2484 
2485     inp14 = LD_UB(src);
2486     src += src_stride;
2487     res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2488                                   inp11, inp12, inp13, inp14,
2489                                   const20, const6, const3);
2490     inp15 = LD_UB(src);
2491     src += src_stride;
2492     res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2493                                   inp12, inp13, inp14, inp15,
2494                                   const20, const6, const3);
2495     LD_UB2(dst, dst_stride, dst0, dst1);
2496     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2497     ST_UB2(res0, res1, dst, dst_stride);
2498     dst += (2 * dst_stride);
2499 
2500     inp16 = LD_UB(src);
2501     res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2502                                   inp13, inp14, inp15, inp16,
2503                                   const20, const6, const3);
2504     res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2505                                   inp14, inp15, inp16, inp16,
2506                                   const20, const6, const3);
2507     LD_UB2(dst, dst_stride, dst0, dst1);
2508     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2509     ST_UB2(res0, res1, dst, dst_stride);
2510     dst += (2 * dst_stride);
2511 
2512     res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2513                                   inp15, inp16, inp16, inp15,
2514                                   const20, const6, const3);
2515     res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2516                                   inp16, inp16, inp15, inp14,
2517                                   const20, const6, const3);
2518     LD_UB2(dst, dst_stride, dst0, dst1);
2519     AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2520     ST_UB2(res0, res1, dst, dst_stride);
2521 }
2522 
/*
 * 8x8 vertical qpel filter whose result is averaged first with the source
 * row below each output row ("aver_src1") and then with the pixels already
 * present in dst ("avg_dst").
 *
 * src, src_stride: source block; nine rows (inp0..inp8) are loaded.
 * dst, dst_stride: destination block; eight rows are read, blended and
 *                  written back.
 *
 * Two rows are packed into one vector (one per 64-bit half), so each
 * APPLY_VERT_QPEL_FILTER_8BYTE call yields two output rows.
 *
 * NOTE(review): every load is a full v16u8 vector; presumably only the low
 * 8 bytes of each row contribute to the result - confirm against
 * APPLY_VERT_QPEL_FILTER_8BYTE.
 */
static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 tmp0, tmp1, res0, res1;
    /* Filter coefficients handed to the filter macro. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    LD_UB2(src, src_stride, inp4, inp5);
    src += (2 * src_stride);
    /* Output rows 0 and 1: taps that would lie above the block reuse
     * inp0/inp1, as the argument lists show. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
                                        inp1, inp2, inp3, inp4,
                                        inp1, inp0, inp0, inp1,
                                        inp2, inp3, inp4, inp5,
                                        const20, const6, const3);
    LD_UB2(src, src_stride, inp6, inp7);
    src += (2 * src_stride);
    /* Output rows 2 and 3. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
                                        inp3, inp4, inp5, inp6,
                                        inp3, inp2, inp1, inp0,
                                        inp4, inp5, inp6, inp7,
                                        const20, const6, const3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack the "src1" rows (row n + 1 for output row n) two per vector:
     * doubleword 1 of the first operand is replaced by doubleword 0 of the
     * second. */
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
    /* Pack the current destination rows the same way. */
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
    /* First average with the source rows, then with the destination. */
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    inp8 = LD_UB(src);
    /* Output rows 4 and 5: the last tap of row 5 reuses inp8. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
                                        inp5, inp6, inp7, inp8,
                                        inp5, inp4, inp3, inp2,
                                        inp6, inp7, inp8, inp8,
                                        const20, const6, const3);
    /* Output rows 6 and 7: taps below the block reuse inp8/inp7/inp6. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
                                        inp7, inp8, inp8, inp7,
                                        inp7, inp6, inp5, inp4,
                                        inp8, inp8, inp7, inp6,
                                        const20, const6, const3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
    tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
    AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
    AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
2581 
/*
 * 16x16 vertical qpel filter whose result is averaged first with the source
 * row below each output row ("aver_src1") and then with the pixels already
 * present in dst ("avg_dst").
 *
 * src, src_stride: source block; seventeen rows (inp0..inp16) are loaded,
 *                  one 16-byte vector per row.
 * dst, dst_stride: destination block; sixteen rows are read, blended and
 *                  written back.
 *
 * The code is fully unrolled and emits two output rows per step. Source
 * rows are loaded lazily, just before the first filter call that needs
 * them. Taps that would fall outside the 17 loaded rows reuse rows near the
 * top (inp0/inp1/...) or bottom (inp16/inp15/...) edge, as spelled out in
 * the macro argument lists.
 */
static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
    v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
    v16u8 res0, res1, dst0, dst1;
    /* Filter coefficients handed to the filter macro. */
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Output rows 0 and 1 (top edge: inp0/inp1 stand in for missing rows). */
    LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
    src += (5 * src_stride);
    res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
                                  inp1, inp2, inp3, inp4,
                                  const20, const6, const3);
    inp5 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
                                  inp2, inp3, inp4, inp5,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    /* Average with the next source row (row n + 1 for output row n) ... */
    AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
    /* ... and then with what is already stored in dst. */
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 2 and 3. */
    inp6 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
                                  inp3, inp4, inp5, inp6,
                                  const20, const6, const3);
    inp7 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
                                  inp4, inp5, inp6, inp7,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 4 and 5. */
    inp8 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
                                  inp5, inp6, inp7, inp8,
                                  const20, const6, const3);
    inp9 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
                                  inp6, inp7, inp8, inp9,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6 and 7. */
    inp10 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
                                  inp7, inp8, inp9, inp10,
                                  const20, const6, const3);
    inp11 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
                                  inp8, inp9, inp10, inp11,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 8 and 9. */
    inp12 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
                                  inp9, inp10, inp11, inp12,
                                  const20, const6, const3);
    inp13 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
                                  inp10, inp11, inp12, inp13,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 10 and 11. */
    inp14 = LD_UB(src);
    src += src_stride;
    res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
                                  inp11, inp12, inp13, inp14,
                                  const20, const6, const3);
    inp15 = LD_UB(src);
    src += src_stride;
    res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
                                  inp12, inp13, inp14, inp15,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 12 and 13: inp16 is the last row loaded; from here on the
     * bottom-edge taps reuse inp16/inp15/inp14. */
    inp16 = LD_UB(src);
    res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
                                  inp13, inp14, inp15, inp16,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
                                  inp14, inp15, inp16, inp16,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 14 and 15. */
    res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
                                  inp15, inp16, inp16, inp15,
                                  const20, const6, const3);
    res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
                                  inp16, inp16, inp15, inp14,
                                  const20, const6, const3);
    LD_UB2(dst, dst_stride, dst0, dst1);
    AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
    AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
    ST_UB2(res0, res1, dst, dst_stride);
}
2714 
hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)2715 static void hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(const uint8_t *src,
2716                                                    int32_t src_stride,
2717                                                    uint8_t *dst,
2718                                                    int32_t dst_stride,
2719                                                    int32_t height)
2720 {
2721     uint8_t loop_count;
2722     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2723     v16u8 res;
2724     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2725     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2726     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2727     v8u16 const20 = (v8u16) __msa_ldi_h(20);
2728 
2729     for (loop_count = (height >> 2); loop_count--;) {
2730         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2731         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2732         src += (4 * src_stride);
2733         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2734                                                const20, const6, const3);
2735         res = __msa_ave_u_b(inp0, res);
2736         ST_UB(res, dst);
2737         dst += dst_stride;
2738 
2739         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2740                                                const20, const6, const3);
2741         res = __msa_ave_u_b(inp2, res);
2742         ST_UB(res, dst);
2743         dst += dst_stride;
2744 
2745         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2746                                                const20, const6, const3);
2747         res = __msa_ave_u_b(inp4, res);
2748         ST_UB(res, dst);
2749         dst += dst_stride;
2750 
2751         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2752                                                const20, const6, const3);
2753         res = __msa_ave_u_b(inp6, res);
2754         ST_UB(res, dst);
2755         dst += dst_stride;
2756     }
2757 
2758     LD_UB2(src, 1, inp0, inp1);
2759     res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2760                                            const20, const6, const3);
2761     res = __msa_ave_u_b(inp0, res);
2762     ST_UB(res, dst);
2763 }
2764 
/*
 * 16x16 no-rounding HV qpel, "src00" variant: runs the horizontal src0 pass
 * into a temporary buffer and feeds that buffer to the vertical
 * no-rounding aver_src0 pass, which writes to dst.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    enum { BLOCK_W = 16, BLOCK_H = 16 };
    /* The horizontal pass emits BLOCK_H + 1 rows of BLOCK_W bytes (272). */
    uint8_t horiz_rows[(BLOCK_H + 1) * BLOCK_W];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride,
                                           horiz_rows, BLOCK_W, BLOCK_H);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_rows, BLOCK_W,
                                            dst, dst_stride);
}
2775 
/*
 * 8x8 no-rounding HV qpel, "src00" variant, done entirely in registers.
 *
 * Horizontal pass: nine source rows are filtered (two rows per
 * APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE call, the ninth with the _1ROW
 * macro) and each result is combined with the source pixels themselves via
 * __msa_ave_u_b (unsigned average without rounding).
 *
 * Vertical pass: the intermediate rows horiz0..horiz8 go through
 * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE and the result is averaged, again
 * with __msa_ave_u_b, with the co-located intermediate rows before two rows
 * at a time are stored to dst.
 *
 * Layout of horizN: an even N holds row N in doubleword 0 and row N + 1 in
 * doubleword 1; an odd N is doubleword 1 of horiz(N - 1) replicated, so its
 * doubleword 0 holds row N.
 *
 * NOTE(review): horiz8 is built from a full 16-byte load; presumably only
 * its low 8 bytes are consumed by the vertical macro - confirm.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns consumed by the horizontal filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Horizontal pass, rows 0-1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low halves of both source rows so they line up with res0. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Horizontal pass, rows 2-3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Horizontal pass, rows 4-5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Vertical pass, output rows 0-1 (top-edge taps reuse horiz0/horiz1). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Horizontal pass, rows 6-7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Horizontal pass, row 8 (single row). */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Vertical pass, output rows 2-3. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Vertical pass, output rows 4-5 (computed before rows 2-3 are stored). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    /* Vertical pass, output rows 6-7 (bottom-edge taps reuse horiz8/7/6). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
2869 
hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)2870 static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
2871                                               int32_t src_stride,
2872                                               uint8_t *dst,
2873                                               int32_t dst_stride,
2874                                               int32_t height)
2875 {
2876     uint8_t loop_count;
2877     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2878     v16u8 res;
2879     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2880     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2881     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2882     v8u16 const20 = (v8u16) __msa_ldi_h(20);
2883 
2884     for (loop_count = (height >> 2); loop_count--;) {
2885         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2886         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2887         src += (4 * src_stride);
2888         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2889                                                const20, const6, const3);
2890         ST_UB(res, dst);
2891         dst += dst_stride;
2892 
2893         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2894                                                const20, const6, const3);
2895         ST_UB(res, dst);
2896         dst += dst_stride;
2897 
2898         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2899                                                const20, const6, const3);
2900         ST_UB(res, dst);
2901         dst += dst_stride;
2902 
2903         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2904                                                const20, const6, const3);
2905         ST_UB(res, dst);
2906         dst += dst_stride;
2907     }
2908 
2909     LD_UB2(src, 1, inp0, inp1);
2910     res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2911                                            const20, const6, const3);
2912     ST_UB(res, dst);
2913 }
2914 
/*
 * 16x16 no-rounding HV qpel, "aver_v_src0" variant: runs the plain
 * horizontal pass into a temporary buffer and feeds that buffer to the
 * vertical no-rounding aver_src0 pass, which writes to dst.
 */
static void hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    enum { BLOCK_W = 16, BLOCK_H = 16 };
    /* The horizontal pass emits BLOCK_H + 1 rows of BLOCK_W bytes (272). */
    uint8_t horiz_rows[(BLOCK_H + 1) * BLOCK_W];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride,
                                      horiz_rows, BLOCK_W, BLOCK_H);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_rows, BLOCK_W,
                                            dst, dst_stride);
}
2925 
hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)2926 static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src,
2927                                                   int32_t src_stride,
2928                                                   uint8_t *dst,
2929                                                   int32_t dst_stride)
2930 {
2931     v16u8 inp0, inp1, inp2, inp3;
2932     v16u8 res0, res1, avg0, avg1;
2933     v16u8 horiz0, horiz1, horiz2, horiz3;
2934     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
2935     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2936     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2937     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2938     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
2939     v16u8 const20 = (v16u8) __msa_ldi_b(20);
2940     v16u8 const6 = (v16u8) __msa_ldi_b(6);
2941     v16u8 const3 = (v16u8) __msa_ldi_b(3);
2942 
2943     LD_UB2(src, src_stride, inp0, inp1);
2944     src += (2 * src_stride);
2945     horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2946                                                     mask2, mask3, const20,
2947                                                     const6, const3);
2948     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2949 
2950     LD_UB2(src, src_stride, inp2, inp3);
2951     src += (2 * src_stride);
2952     horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2953                                                     mask2, mask3, const20,
2954                                                     const6, const3);
2955     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
2956     LD_UB2(src, src_stride, inp0, inp1);
2957     src += (2 * src_stride);
2958     horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2959                                                     mask2, mask3, const20,
2960                                                     const6, const3);
2961     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
2962     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2963                                                  horiz1, horiz2, horiz3, horiz4,
2964                                                  horiz1, horiz0, horiz0, horiz1,
2965                                                  horiz2, horiz3, horiz4, horiz5,
2966                                                  const20, const6, const3);
2967     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2968     res0 = __msa_ave_u_b(avg0, res0);
2969     ST_D2(res0, 0, 1, dst, dst_stride);
2970     dst += (2 * dst_stride);
2971 
2972     LD_UB2(src, src_stride, inp2, inp3);
2973     src += (2 * src_stride);
2974     horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2975                                                     mask2, mask3, const20,
2976                                                     const6, const3);
2977     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
2978     inp0 = LD_UB(src);
2979     horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2980                                                          mask2, mask3, const20,
2981                                                          const6, const3);
2982     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2983                                                  horiz3, horiz4, horiz5, horiz6,
2984                                                  horiz3, horiz2, horiz1, horiz0,
2985                                                  horiz4, horiz5, horiz6, horiz7,
2986                                                  const20, const6, const3);
2987     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2988     res1 = __msa_ave_u_b(avg1, res1);
2989     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2990     res0 = __msa_ave_u_b(avg0, res0);
2991     ST_D2(res1, 0, 1, dst, dst_stride);
2992     dst += (2 * dst_stride);
2993 
2994     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2995                                                  horiz5, horiz6, horiz7, horiz8,
2996                                                  horiz5, horiz4, horiz3, horiz2,
2997                                                  horiz6, horiz7, horiz8, horiz8,
2998                                                  const20, const6, const3);
2999     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3000     res0 = __msa_ave_u_b(avg0, res0);
3001     ST_D2(res0, 0, 1, dst, dst_stride);
3002     dst += (2 * dst_stride);
3003 
3004     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3005                                                  horiz7, horiz8, horiz8, horiz7,
3006                                                  horiz7, horiz6, horiz5, horiz4,
3007                                                  horiz8, horiz8, horiz7, horiz6,
3008                                                  const20, const6, const3);
3009     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3010     res1 = __msa_ave_u_b(avg1, res1);
3011     ST_D2(res1, 0, 1, dst, dst_stride);
3012 }
3013 
hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)3014 static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
3015                                                    int32_t src_stride,
3016                                                    uint8_t *dst,
3017                                                    int32_t dst_stride,
3018                                                    int32_t height)
3019 {
3020     uint8_t loop_count;
3021     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3022     v16u8 res;
3023     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3024     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3025     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3026     v8u16 const20 = (v8u16) __msa_ldi_h(20);
3027 
3028     for (loop_count = (height >> 2); loop_count--;) {
3029         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3030         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3031         src += (4 * src_stride);
3032         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3033                                                const20, const6, const3);
3034         res = __msa_ave_u_b(res, inp1);
3035         ST_UB(res, dst);
3036         dst += dst_stride;
3037 
3038         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
3039                                                const20, const6, const3);
3040         res = __msa_ave_u_b(res, inp3);
3041         ST_UB(res, dst);
3042         dst += dst_stride;
3043 
3044         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
3045                                                const20, const6, const3);
3046         res = __msa_ave_u_b(res, inp5);
3047         ST_UB(res, dst);
3048         dst += dst_stride;
3049 
3050         res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
3051                                                const20, const6, const3);
3052         res = __msa_ave_u_b(res, inp7);
3053         ST_UB(res, dst);
3054         dst += dst_stride;
3055     }
3056 
3057     LD_UB2(src, 1, inp0, inp1);
3058     res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3059                                            const20, const6, const3);
3060     res = __msa_ave_u_b(inp1, res);
3061     ST_UB(res, dst);
3062 }
3063 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_src1_16x16_msa into a stack scratch
 * buffer with a 16-byte row stride; with height 16 that helper writes 17 rows
 * of 16 bytes, which is exactly the scratch size.  Pass 2 hands the scratch
 * buffer to vert_mc_qpel_no_rnd_aver_src0_16x16_msa, which writes to dst.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, scratch,
                                           scratch_stride, block_height);
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3074 
/*
 * 8x8 no-rounding hv qpel, "aver_hv_src10" case.
 *
 * Nine source rows are loaded (four LD_UB2 pairs plus one LD_UB).  Each row is
 * run through the horizontal no-round filter macro and the filter output is
 * averaged (__msa_ave_u_b, truncating) with the same row slid by one byte,
 * giving horiz0..horiz8.  Even-numbered horizN hold two consecutive rows in
 * their low/high doublewords; odd-numbered horizN are the high doubleword of
 * the preceding vector replicated by __msa_splati_d.
 *
 * The vertical no-round filter macro then produces two output rows per call,
 * and each pair is averaged with the horizontal rows at the same positions
 * (horiz0/1, horiz2/3, horiz4/5, horiz6/7) before being stored with ST_D2.
 *
 * NOTE(review): the filter macros, SLDI_B2_UB and ST_D2 are defined outside
 * this chunk; tap layout and edge mirroring are assumed from the argument
 * patterns, not verified here.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns over pixel indices 0..8 of a row; the repeated
     * indices near both ends presumably implement edge mirroring. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0 and 1 -> horiz0 (both rows), horiz1 (row 1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Slide both rows by one byte (presumably so lane i holds pixel i + 1). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low doubleword of inp1 into the high doubleword of inp0. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2 and 3 -> horiz2, horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4 and 5 -> horiz4, horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0 and 1; horiz0/horiz1 are repeated in the argument list
     * (presumably mirroring above the top edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* avg0 = low doublewords of horiz0 and horiz1, i.e. rows 0 and 1. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6 and 7 -> horiz6, horiz7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Source row 8 (single row) -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Output rows 2 and 3. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 4 and 5; horiz8 is repeated (presumably bottom mirroring). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6 and 7. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3177 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_src0_16x16_msa into a 272-byte stack
 * scratch buffer using a 16-byte row stride and height 16.  Pass 2 feeds the
 * scratch buffer to vert_mc_qpel_no_rnd_16x16_msa, which writes to dst.
 *
 * NOTE(review): 272 bytes is presumably 17 rows of 16 bytes (one row more
 * than the block height); the helper is defined outside this chunk.
 */
static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, scratch,
                                           scratch_stride, block_height);
    vert_mc_qpel_no_rnd_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
3188 
/*
 * 8x8 no-rounding hv qpel, "aver_h_src0" case.
 *
 * Nine source rows are loaded (four LD_UB2 pairs plus one LD_UB).  Each row is
 * run through the horizontal no-round filter macro and the filter output is
 * averaged (__msa_ave_u_b, truncating) with the unshifted source row, giving
 * horiz0..horiz8.  Even-numbered horizN hold two consecutive rows in their
 * low/high doublewords; odd-numbered horizN are the high doubleword of the
 * preceding vector replicated by __msa_splati_d.
 *
 * The vertical no-round filter macro then produces two output rows per call;
 * its results are stored directly (no further averaging) as eight 8-byte rows.
 *
 * NOTE(review): the filter macros and ST_D2/ST_D4 are defined outside this
 * chunk; tap layout and edge mirroring are assumed from the argument
 * patterns, not verified here.
 */
static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns over pixel indices 0..8 of a row; the repeated
     * indices near both ends presumably implement edge mirroring. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0 and 1 -> horiz0 (both rows), horiz1 (row 1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low doublewords of inp0 (low half) and inp1 (high half). */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2 and 3 -> horiz2, horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4 and 5 -> horiz4, horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0 and 1; horiz0/horiz1 are repeated in the argument list
     * (presumably mirroring above the top edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);

    /* Load source rows 6 and 7 before storing output rows 0 and 1. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Source row 8 (single row) -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Output rows 2 and 3 (res1), then rows 4 and 5 (res0). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Output rows 6 and 7; horiz8 is repeated (presumably bottom mirroring). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3272 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_16x16_msa into a 272-byte stack scratch
 * buffer using a 16-byte row stride and height 16.  Pass 2 feeds the scratch
 * buffer to vert_mc_qpel_no_rnd_16x16_msa, which writes to dst.
 *
 * NOTE(review): 272 bytes is presumably 17 rows of 16 bytes (one row more
 * than the block height); the helper is defined outside this chunk.
 */
static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
                                        int32_t src_stride,
                                        uint8_t *dst,
                                        int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, scratch,
                                      scratch_stride, block_height);
    vert_mc_qpel_no_rnd_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
3283 
/*
 * 8x8 no-rounding hv qpel, plain case (no averaging with source pixels).
 *
 * Nine source rows are loaded (four LD_UB2 pairs plus one LD_UB) and run
 * through the horizontal no-round filter macro, giving horiz0..horiz8.
 * Even-numbered horizN hold two consecutive rows in their low/high
 * doublewords; odd-numbered horizN are the high doubleword of the preceding
 * vector replicated by __msa_splati_d.
 *
 * The vertical no-round filter macro then produces two output rows per call;
 * its results are stored directly as eight 8-byte rows.
 *
 * NOTE(review): the filter macros and ST_D2/ST_D4 are defined outside this
 * chunk; tap layout and edge mirroring are assumed from the argument
 * patterns, not verified here.
 */
static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns over pixel indices 0..8 of a row; the repeated
     * indices near both ends presumably implement edge mirroring. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0 and 1 -> horiz0 (both rows), horiz1 (row 1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2 and 3 -> horiz2, horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4 and 5 -> horiz4, horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0 and 1; horiz0/horiz1 are repeated in the argument list
     * (presumably mirroring above the top edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Load source rows 6 and 7 before storing output rows 0 and 1. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                    mask2, mask3, const20,
                                                    const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Source row 8 (single row) -> horiz8. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                         mask2, mask3, const20,
                                                         const6, const3);
    /* Output rows 2 and 3 (res1), then rows 4 and 5 (res0). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;


    /* Output rows 6 and 7; stored together with rows 4 and 5 held in res0. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3358 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_src1_16x16_msa into a stack scratch
 * buffer with a 16-byte row stride; with height 16 that helper writes 17 rows
 * of 16 bytes, which is exactly the scratch size.  Pass 2 hands the scratch
 * buffer to vert_mc_qpel_no_rnd_16x16_msa, which writes to dst.
 */
static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, scratch,
                                           scratch_stride, block_height);
    vert_mc_qpel_no_rnd_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
3369 
/*
 * 8x8 no-rounding hv qpel, "aver_h_src1" case.
 *
 * Nine source rows are loaded (four LD_UB2 pairs plus one LD_UB).  Each row is
 * run through the horizontal no-round filter macro and the filter output is
 * averaged (__msa_ave_u_b, truncating) with the same row slid by one byte,
 * giving horiz0..horiz8.  Even-numbered horizN hold two consecutive rows in
 * their low/high doublewords; odd-numbered horizN are the high doubleword of
 * the preceding vector replicated by __msa_splati_d.
 *
 * The vertical no-round filter macro then produces two output rows per call;
 * its results are stored directly (no further averaging) as eight 8-byte rows.
 *
 * NOTE(review): the filter macros, SLDI_B2_UB and ST_D2/ST_D4 are defined
 * outside this chunk; tap layout and edge mirroring are assumed from the
 * argument patterns, not verified here.
 */
static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns over pixel indices 0..8 of a row; the repeated
     * indices near both ends presumably implement edge mirroring. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0 and 1 -> horiz0 (both rows), horiz1 (row 1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Slide both rows by one byte (presumably so lane i holds pixel i + 1). */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low doubleword of inp1 into the high doubleword of inp0. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2 and 3 -> horiz2, horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4 and 5 -> horiz4, horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0 and 1; horiz0/horiz1 are repeated in the argument list
     * (presumably mirroring above the top edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* Load source rows 6 and 7 before storing output rows 0 and 1. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Source row 8 (single row) -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Output rows 2 and 3 (res1), then rows 4 and 5 (res0). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    /* Output rows 6 and 7; stored together with rows 4 and 5 held in res0. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
3461 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_src0_16x16_msa into a 272-byte stack
 * scratch buffer using a 16-byte row stride and height 16.  Pass 2 feeds the
 * scratch buffer to vert_mc_qpel_no_rnd_aver_src1_16x16_msa, which writes to
 * dst.
 *
 * NOTE(review): 272 bytes is presumably 17 rows of 16 bytes (one row more
 * than the block height); the helper is defined outside this chunk.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, scratch,
                                           scratch_stride, block_height);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3472 
/*
 * 8x8 no-rounding hv qpel, "aver_hv_src01" case.
 *
 * Nine source rows are loaded (four LD_UB2 pairs plus one LD_UB).  Each row is
 * run through the horizontal no-round filter macro and the filter output is
 * averaged (__msa_ave_u_b, truncating) with the unshifted source row, giving
 * horiz0..horiz8.  Even-numbered horizN hold two consecutive rows in their
 * low/high doublewords; odd-numbered horizN are the high doubleword of the
 * preceding vector replicated by __msa_splati_d.
 *
 * The vertical no-round filter macro then produces two output rows per call,
 * and each pair is averaged with the horizontal rows one line further down
 * (horiz1/2, horiz3/4, horiz5/6, horiz7/8) before being stored with ST_D2.
 *
 * NOTE(review): the filter macros and ST_D2 are defined outside this chunk;
 * tap layout and edge mirroring are assumed from the argument patterns, not
 * verified here.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns over pixel indices 0..8 of a row; the repeated
     * indices near both ends presumably implement edge mirroring. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0 and 1 -> horiz0 (both rows), horiz1 (row 1). */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    /* Pack the low doublewords of inp0 (low half) and inp1 (high half). */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_ave_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    /* Source rows 2 and 3 -> horiz2, horiz3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_ave_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4 and 5 -> horiz4, horiz5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_ave_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0 and 1; horiz0/horiz1 are repeated in the argument list
     * (presumably mirroring above the top edge). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                                 horiz1, horiz2, horiz3, horiz4,
                                                 horiz1, horiz0, horiz0, horiz1,
                                                 horiz2, horiz3, horiz4, horiz5,
                                                 const20, const6, const3);
    /* avg0 = low doublewords of horiz1 and horiz2, i.e. rows 1 and 2. */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_ave_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6 and 7 -> horiz6, horiz7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
                                                  mask2, mask3, const20,
                                                  const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_ave_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Source row 8 (single row) -> horiz8. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
                                                       mask2, mask3, const20,
                                                       const6, const3);
    horiz8 = __msa_ave_u_b(inp0, res0);
    /* Output rows 2 and 3, averaged with rows 3 and 4 of the horizontal pass. */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                                 horiz3, horiz4, horiz5, horiz6,
                                                 horiz3, horiz2, horiz1, horiz0,
                                                 horiz4, horiz5, horiz6, horiz7,
                                                 const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_ave_u_b(avg1, res1);
    /* Output rows 4 and 5 (averaged and stored further below). */
    res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                                 horiz5, horiz6, horiz7, horiz8,
                                                 horiz5, horiz4, horiz3, horiz2,
                                                 horiz6, horiz7, horiz8, horiz8,
                                                 const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_ave_u_b(avg0, res0);

    /* Output rows 6 and 7; horiz8 is repeated (presumably bottom mirroring). */
    res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                                 horiz7, horiz8, horiz8, horiz7,
                                                 horiz7, horiz6, horiz5, horiz4,
                                                 horiz8, horiz8, horiz7, horiz6,
                                                 const20, const6, const3);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_ave_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
3567 
/*
 * 16x16 no-rounding hv qpel: two-pass wrapper.
 *
 * Pass 1 runs hv_mc_qpel_no_rnd_horiz_16x16_msa into a 272-byte stack scratch
 * buffer using a 16-byte row stride and height 16.  Pass 2 feeds the scratch
 * buffer to vert_mc_qpel_no_rnd_aver_src1_16x16_msa, which writes to dst.
 *
 * NOTE(review): 272 bytes is presumably 17 rows of 16 bytes (one row more
 * than the block height); the helper is defined outside this chunk.
 */
static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *dst,
                                                    int32_t dst_stride)
{
    const int32_t scratch_stride = 16;
    const int32_t block_height = 16;
    uint8_t scratch[17 * 16];

    hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, scratch,
                                      scratch_stride, block_height);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(scratch, scratch_stride,
                                            dst, dst_stride);
}
3578 
hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3579 static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src,
3580                                                   int32_t src_stride,
3581                                                   uint8_t *dst,
3582                                                   int32_t dst_stride)
3583 {
3584     v16u8 inp0, inp1, inp2, inp3;
3585     v16u8 res0, res1, avg0, avg1;
3586     v16u8 horiz0, horiz1, horiz2, horiz3;
3587     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3588     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3589     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3590     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3591     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3592     v16u8 const20 = (v16u8) __msa_ldi_b(20);
3593     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3594     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3595 
3596     LD_UB2(src, src_stride, inp0, inp1);
3597     src += (2 * src_stride);
3598     horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3599                                                     mask2, mask3, const20,
3600                                                     const6, const3);
3601     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3602     LD_UB2(src, src_stride, inp2, inp3);
3603     src += (2 * src_stride);
3604     horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3605                                                     mask2, mask3, const20,
3606                                                     const6, const3);
3607     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3608     LD_UB2(src, src_stride, inp0, inp1);
3609     src += (2 * src_stride);
3610     horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3611                                                     mask2, mask3, const20,
3612                                                     const6, const3);
3613     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3614     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3615                                                  horiz1, horiz2, horiz3, horiz4,
3616                                                  horiz1, horiz0, horiz0, horiz1,
3617                                                  horiz2, horiz3, horiz4, horiz5,
3618                                                  const20, const6, const3);
3619     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3620     res0 = __msa_ave_u_b(avg0, res0);
3621     LD_UB2(src, src_stride, inp2, inp3);
3622     src += (2 * src_stride);
3623     ST_D2(res0, 0, 1, dst, dst_stride);
3624     dst += 2 * dst_stride;
3625 
3626     horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3627                                                     mask2, mask3, const20,
3628                                                     const6, const3);
3629     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3630     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3631                                                  horiz3, horiz4, horiz5, horiz6,
3632                                                  horiz3, horiz2, horiz1, horiz0,
3633                                                  horiz4, horiz5, horiz6, horiz7,
3634                                                  const20, const6, const3);
3635     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3636     res1 = __msa_ave_u_b(avg1, res1);
3637     inp0 = LD_UB(src);
3638     horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3639                                                          mask2, mask3, const20,
3640                                                          const6, const3);
3641     ST_D2(res1, 0, 1, dst, dst_stride);
3642     dst += 2 * dst_stride;
3643 
3644     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3645                                                  horiz5, horiz6, horiz7, horiz8,
3646                                                  horiz5, horiz4, horiz3, horiz2,
3647                                                  horiz6, horiz7, horiz8, horiz8,
3648                                                  const20, const6, const3);
3649     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3650     res0 = __msa_ave_u_b(avg0, res0);
3651     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3652                                                  horiz7, horiz8, horiz8, horiz7,
3653                                                  horiz7, horiz6, horiz5, horiz4,
3654                                                  horiz8, horiz8, horiz7, horiz6,
3655                                                  const20, const6, const3);
3656     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3657     res1 = __msa_ave_u_b(avg1, res1);
3658     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3659 }
3660 
/*
 * 16x16 quarter-pel hv interpolation, no-rounding flavour, "src11" variant.
 *
 * Runs the "src1" no-rounding horizontal pass into a scratch buffer and then
 * the vertical "aver_src1" no-rounding pass from that buffer into dst.
 *
 * src, src_stride: source block and its line pitch.
 * dst, dst_stride: destination block and its line pitch.
 */
static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *dst,
                                                      int32_t dst_stride)
{
    /* 272 bytes = 17 rows of 16; presumably the horizontal pass emits one
     * row more than the 16 requested - confirm against the callee. */
    uint8_t horiz_rows[17 * 16];

    /* Scratch buffer is addressed with a pitch of 16 bytes in both passes. */
    hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, horiz_rows,
                                           16, 16);
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
3671 
hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3672 static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src,
3673                                                     int32_t src_stride,
3674                                                     uint8_t *dst,
3675                                                     int32_t dst_stride)
3676 {
3677     v16u8 inp0, inp1, inp2, inp3;
3678     v16u8 res0, res1, avg0, avg1;
3679     v16u8 horiz0, horiz1, horiz2, horiz3;
3680     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3681     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3682     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3683     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3684     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3685     v16u8 const20 = (v16u8) __msa_ldi_b(20);
3686     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3687     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3688 
3689     LD_UB2(src, src_stride, inp0, inp1);
3690     src += (2 * src_stride);
3691     res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3692                                                   mask2, mask3, const20,
3693                                                   const6, const3);
3694     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3695 
3696     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3697     horiz0 = __msa_ave_u_b(inp0, res0);
3698     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3699     LD_UB2(src, src_stride, inp2, inp3);
3700     src += (2 * src_stride);
3701     res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3702                                                   mask2, mask3, const20,
3703                                                   const6, const3);
3704     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3705 
3706     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3707     horiz2 = __msa_ave_u_b(inp2, res1);
3708     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3709     LD_UB2(src, src_stride, inp0, inp1);
3710     src += (2 * src_stride);
3711     res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3712                                                   mask2, mask3, const20,
3713                                                   const6, const3);
3714 
3715     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
3716     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3717     horiz4 = __msa_ave_u_b(inp0, res0);
3718     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3719     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3720                                                  horiz1, horiz2, horiz3, horiz4,
3721                                                  horiz1, horiz0, horiz0, horiz1,
3722                                                  horiz2, horiz3, horiz4, horiz5,
3723                                                  const20, const6, const3);
3724     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3725     res0 = __msa_ave_u_b(avg0, res0);
3726     ST_D2(res0, 0, 1, dst, dst_stride);
3727     dst += (2 * dst_stride);
3728 
3729     LD_UB2(src, src_stride, inp2, inp3);
3730     src += (2 * src_stride);
3731     res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3732                                                   mask2, mask3, const20,
3733                                                   const6, const3);
3734     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
3735 
3736     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3737     horiz6 = __msa_ave_u_b(inp2, res1);
3738     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3739     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3740                                                  horiz3, horiz4, horiz5, horiz6,
3741                                                  horiz3, horiz2, horiz1, horiz0,
3742                                                  horiz4, horiz5, horiz6, horiz7,
3743                                                  const20, const6, const3);
3744     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3745     res1 = __msa_ave_u_b(avg1, res1);
3746     ST_D2(res1, 0, 1, dst, dst_stride);
3747     dst += (2 * dst_stride);
3748 
3749     inp0 = LD_UB(src);
3750     res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3751                                                        mask2, mask3, const20,
3752                                                        const6, const3);
3753     inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3754     horiz8 = __msa_ave_u_b(inp0, res0);
3755     res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3756                                                  horiz5, horiz6, horiz7, horiz8,
3757                                                  horiz5, horiz4, horiz3, horiz2,
3758                                                  horiz6, horiz7, horiz8, horiz8,
3759                                                  const20, const6, const3);
3760     res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3761                                                  horiz7, horiz8, horiz8, horiz7,
3762                                                  horiz7, horiz6, horiz5, horiz4,
3763                                                  horiz8, horiz8, horiz7, horiz6,
3764                                                  const20, const6, const3);
3765     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3766     res0 = __msa_ave_u_b(avg0, res0);
3767     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3768     res1 = __msa_ave_u_b(avg1, res1);
3769     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3770 }
3771 
hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)3772 static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
3773                                                  int32_t src_stride,
3774                                                  uint8_t *dst,
3775                                                  int32_t dst_stride,
3776                                                  int32_t height)
3777 {
3778     uint8_t loop_count;
3779     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3780     v16u8 res;
3781     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3782     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3783     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3784     v8u16 const20 = (v8u16) __msa_ldi_h(20);
3785 
3786     for (loop_count = (height >> 2); loop_count--;) {
3787         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3788         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3789         src += (4 * src_stride);
3790         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3791                                       const20, const6, const3);
3792         res = __msa_aver_u_b(inp0, res);
3793         ST_UB(res, dst);
3794         dst += dst_stride;
3795 
3796         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3797                                       const20, const6, const3);
3798         res = __msa_aver_u_b(inp2, res);
3799         ST_UB(res, dst);
3800         dst += dst_stride;
3801 
3802         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3803                                       const20, const6, const3);
3804         res = __msa_aver_u_b(inp4, res);
3805         ST_UB(res, dst);
3806         dst += dst_stride;
3807 
3808         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3809                                       const20, const6, const3);
3810         res = __msa_aver_u_b(inp6, res);
3811         ST_UB(res, dst);
3812         dst += dst_stride;
3813     }
3814 
3815     LD_UB2(src, 1, inp0, inp1);
3816     res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3817     res = __msa_aver_u_b(inp0, res);
3818     ST_UB(res, dst);
3819 }
3820 
/*
 * 16x16 quarter-pel hv interpolation, rounding flavour, "src00" variant.
 *
 * Runs the "src0" horizontal pass into a scratch buffer and then the
 * vertical "aver_src0" pass from that buffer into dst.
 *
 * src, src_stride: source block and its line pitch.
 * dst, dst_stride: destination block and its line pitch.
 */
static void hv_mc_qpel_aver_hv_src00_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* The horizontal pass stores height + 1 = 17 rows of 16 bytes. */
    uint8_t horiz_rows[17 * 16];

    /* Scratch buffer is addressed with a pitch of 16 bytes in both passes. */
    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_rows, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
3831 
hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3832 static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src,
3833                                              int32_t src_stride,
3834                                              uint8_t *dst,
3835                                              int32_t dst_stride)
3836 {
3837     v16u8 inp0, inp1, inp2, inp3;
3838     v16u8 res0, res1, avg0, avg1;
3839     v16u8 horiz0, horiz1, horiz2, horiz3;
3840     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3841     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3842     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3843     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3844     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3845     v16u8 const20 = (v16u8) __msa_ldi_b(20);
3846     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3847     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3848 
3849     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3850     src += (4 * src_stride);
3851     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3852                                          const20, const6, const3);
3853     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3854                                          const20, const6, const3);
3855     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3856     horiz0 = __msa_aver_u_b(inp0, res0);
3857     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3858     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3859     horiz2 = __msa_aver_u_b(inp2, res1);
3860     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3861     LD_UB2(src, src_stride, inp0, inp1);
3862     src += (2 * src_stride);
3863     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3864                                          const20, const6, const3);
3865     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3866     horiz4 = __msa_aver_u_b(inp0, res0);
3867     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
3868     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3869                                         horiz1, horiz2, horiz3, horiz4,
3870                                         horiz1, horiz0, horiz0, horiz1,
3871                                         horiz2, horiz3, horiz4, horiz5,
3872                                         const20, const6, const3);
3873     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3874     res0 = __msa_aver_u_b(avg0, res0);
3875     ST_D2(res0, 0, 1, dst, dst_stride);
3876     dst += (2 * dst_stride);
3877 
3878     LD_UB2(src, src_stride, inp2, inp3);
3879     src += (2 * src_stride);
3880     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3881                                          const20, const6, const3);
3882     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3883     horiz6 = __msa_aver_u_b(inp2, res1);
3884     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3885     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3886                                         horiz3, horiz4, horiz5, horiz6,
3887                                         horiz3, horiz2, horiz1, horiz0,
3888                                         horiz4, horiz5, horiz6, horiz7,
3889                                         const20, const6, const3);
3890     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3891     res1 = __msa_aver_u_b(avg1, res1);
3892 
3893     inp0 = LD_UB(src);
3894     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3895                                               const20, const6, const3);
3896     horiz8 = __msa_aver_u_b(inp0, res0);
3897     ST_D2(res1, 0, 1, dst, dst_stride);
3898     dst += 2 * dst_stride;
3899 
3900     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3901                                         horiz5, horiz6, horiz7, horiz8,
3902                                         horiz5, horiz4, horiz3, horiz2,
3903                                         horiz6, horiz7, horiz8, horiz8,
3904                                         const20, const6, const3);
3905     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3906     res0 = __msa_aver_u_b(avg0, res0);
3907     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3908                                         horiz7, horiz8, horiz8, horiz7,
3909                                         horiz7, horiz6, horiz5, horiz4,
3910                                         horiz8, horiz8, horiz7, horiz6,
3911                                         const20, const6, const3);
3912     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3913     res1 = __msa_aver_u_b(avg1, res1);
3914     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3915 }
3916 
hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)3917 static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
3918                                             int32_t src_stride,
3919                                             uint8_t *dst,
3920                                             int32_t dst_stride,
3921                                             int32_t height)
3922 {
3923     uint8_t loop_count;
3924     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3925     v16u8 res;
3926     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
3927     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3928     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3929     v8u16 const20 = (v8u16) __msa_ldi_h(20);
3930 
3931     for (loop_count = (height >> 2); loop_count--;) {
3932         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3933         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3934         src += (4 * src_stride);
3935         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3936                                       const20, const6, const3);
3937         ST_UB(res, dst);
3938         dst += dst_stride;
3939 
3940         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3941                                       const20, const6, const3);
3942         ST_UB(res, dst);
3943         dst += dst_stride;
3944 
3945         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3946                                       const20, const6, const3);
3947         ST_UB(res, dst);
3948         dst += dst_stride;
3949 
3950         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3951                                       const20, const6, const3);
3952         ST_UB(res, dst);
3953         dst += dst_stride;
3954     }
3955 
3956     LD_UB2(src, 1, inp0, inp1);
3957     res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3958     ST_UB(res, dst);
3959 }
3960 
/*
 * 16x16 quarter-pel hv interpolation, rounding flavour, "v_src0" variant.
 *
 * Runs the plain horizontal pass into a scratch buffer and then the
 * vertical "aver_src0" pass from that buffer into dst.
 *
 * src, src_stride: source block and its line pitch.
 * dst, dst_stride: destination block and its line pitch.
 */
static void hv_mc_qpel_aver_v_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    /* The horizontal pass stores height + 1 = 17 rows of 16 bytes. */
    uint8_t horiz_rows[17 * 16];

    /* Scratch buffer is addressed with a pitch of 16 bytes in both passes. */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_rows, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
3971 
hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)3972 static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src,
3973                                            int32_t src_stride,
3974                                            uint8_t *dst,
3975                                            int32_t dst_stride)
3976 {
3977     v16u8 inp0, inp1, inp2, inp3;
3978     v16u8 res0, res1, avg0, avg1;
3979     v16u8 horiz0, horiz1, horiz2, horiz3;
3980     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
3981     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3982     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3983     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3984     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
3985     v16u8 const20 = (v16u8) __msa_ldi_b(20);
3986     v16u8 const6 = (v16u8) __msa_ldi_b(6);
3987     v16u8 const3 = (v16u8) __msa_ldi_b(3);
3988 
3989     LD_UB2(src, src_stride, inp0, inp1);
3990     src += (2 * src_stride);
3991     horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
3992                                            mask0, mask1, mask2, mask3,
3993                                            const20, const6, const3);
3994     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3995     LD_UB2(src, src_stride, inp2, inp3);
3996     src += (2 * src_stride);
3997     horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
3998                                            mask0, mask1, mask2, mask3,
3999                                            const20, const6, const3);
4000     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4001     LD_UB2(src, src_stride, inp0, inp1);
4002     src += (2 * src_stride);
4003     horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4004                                            mask0, mask1, mask2, mask3,
4005                                            const20, const6, const3);
4006     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4007     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4008                                         horiz1, horiz2, horiz3, horiz4,
4009                                         horiz1, horiz0, horiz0, horiz1,
4010                                         horiz2, horiz3, horiz4, horiz5,
4011                                         const20, const6, const3);
4012     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4013     res0 = __msa_aver_u_b(avg0, res0);
4014     ST_D2(res0, 0, 1, dst, dst_stride);
4015     dst += (2 * dst_stride);
4016 
4017     LD_UB2(src, src_stride, inp2, inp3);
4018     src += (2 * src_stride);
4019     horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4020                                            mask0, mask1, mask2, mask3,
4021                                            const20, const6, const3);
4022     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4023     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4024                                         horiz3, horiz4, horiz5, horiz6,
4025                                         horiz3, horiz2, horiz1, horiz0,
4026                                         horiz4, horiz5, horiz6, horiz7,
4027                                         const20, const6, const3);
4028     inp0 = LD_UB(src);
4029     horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4030                                                 mask0, mask1, mask2, mask3,
4031                                                 const20, const6, const3);
4032     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4033     res1 = __msa_aver_u_b(avg1, res1);
4034     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4035                                         horiz5, horiz6, horiz7, horiz8,
4036                                         horiz5, horiz4, horiz3, horiz2,
4037                                         horiz6, horiz7, horiz8, horiz8,
4038                                         const20, const6, const3);
4039     ST_D2(res1, 0, 1, dst, dst_stride);
4040     dst += 2 * dst_stride;
4041 
4042     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4043     res0 = __msa_aver_u_b(avg0, res0);
4044     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4045                                         horiz7, horiz8, horiz8, horiz7,
4046                                         horiz7, horiz6, horiz5, horiz4,
4047                                         horiz8, horiz8, horiz7, horiz6,
4048                                         const20, const6, const3);
4049     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4050     res1 = __msa_aver_u_b(avg1, res1);
4051     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4052 }
4053 
hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)4054 static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
4055                                                  int32_t src_stride,
4056                                                  uint8_t *dst,
4057                                                  int32_t dst_stride,
4058                                                  int32_t height)
4059 {
4060     uint8_t loop_count;
4061     v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4062     v16u8 res;
4063     v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
4064     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4065     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4066     v8u16 const20 = (v8u16) __msa_ldi_h(20);
4067 
4068     for (loop_count = (height >> 2); loop_count--;) {
4069         LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4070         LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4071         src += (4 * src_stride);
4072         res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4073                                       const20, const6, const3);
4074         res = __msa_aver_u_b(res, inp1);
4075         ST_UB(res, dst);
4076         dst += dst_stride;
4077 
4078         res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4079                                       const20, const6, const3);
4080         res = __msa_aver_u_b(res, inp3);
4081         ST_UB(res, dst);
4082         dst += dst_stride;
4083 
4084         res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4085                                       const20, const6, const3);
4086         res = __msa_aver_u_b(res, inp5);
4087         ST_UB(res, dst);
4088         dst += dst_stride;
4089 
4090         res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4091                                       const20, const6, const3);
4092         res = __msa_aver_u_b(res, inp7);
4093         ST_UB(res, dst);
4094         dst += dst_stride;
4095     }
4096 
4097     LD_UB2(src, 1, inp0, inp1);
4098     res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4099     res = __msa_aver_u_b(inp1, res);
4100     ST_UB(res, dst);
4101 }
4102 
/*
 * 16x16 quarter-pel hv interpolation, rounding flavour, "src10" variant.
 *
 * Runs the "src1" horizontal pass into a scratch buffer and then the
 * vertical "aver_src0" pass from that buffer into dst.
 *
 * src, src_stride: source block and its line pitch.
 * dst, dst_stride: destination block and its line pitch.
 */
static void hv_mc_qpel_aver_hv_src10_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    /* The horizontal pass stores height + 1 = 17 rows of 16 bytes. */
    uint8_t horiz_rows[17 * 16];

    /* Scratch buffer is addressed with a pitch of 16 bytes in both passes. */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_rows, 16, 16);
    vert_mc_qpel_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
4113 
hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4114 static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src,
4115                                              int32_t src_stride,
4116                                              uint8_t *dst,
4117                                              int32_t dst_stride)
4118 {
4119     v16u8 inp0, inp1, inp2, inp3;
4120     v16u8 res0, res1, avg0, avg1;
4121     v16u8 horiz0, horiz1, horiz2, horiz3;
4122     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4123     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4124     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4125     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4126     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4127     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4128     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4129     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4130 
4131     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4132     src += (4 * src_stride);
4133     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4134                                          const20, const6, const3);
4135     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4136                                          const20, const6, const3);
4137     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4138 
4139     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4140     horiz0 = __msa_aver_u_b(inp0, res0);
4141     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4142     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4143 
4144     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4145     horiz2 = __msa_aver_u_b(inp2, res1);
4146     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4147     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4148     src += (4 * src_stride);
4149     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4150                                          const20, const6, const3);
4151     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4152                                          const20, const6, const3);
4153     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4154 
4155     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4156     horiz4 = __msa_aver_u_b(inp0, res0);
4157     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4158     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4159 
4160     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4161     horiz6 = __msa_aver_u_b(inp2, res1);
4162     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4163     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4164                                         horiz1, horiz2, horiz3, horiz4,
4165                                         horiz1, horiz0, horiz0, horiz1,
4166                                         horiz2, horiz3, horiz4, horiz5,
4167                                         const20, const6, const3);
4168     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4169     res0 = __msa_aver_u_b(avg0, res0);
4170     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4171                                         horiz3, horiz4, horiz5, horiz6,
4172                                         horiz3, horiz2, horiz1, horiz0,
4173                                         horiz4, horiz5, horiz6, horiz7,
4174                                         const20, const6, const3);
4175     ST_D2(res0, 0, 1, dst, dst_stride);
4176     dst += 2 * dst_stride;
4177 
4178     inp0 = LD_UB(src);
4179     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4180                                               const20, const6, const3);
4181     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4182     res1 = __msa_aver_u_b(avg1, res1);
4183     inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4184     horiz8 = __msa_aver_u_b(inp0, res0);
4185     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4186                                         horiz5, horiz6, horiz7, horiz8,
4187                                         horiz5, horiz4, horiz3, horiz2,
4188                                         horiz6, horiz7, horiz8, horiz8,
4189                                         const20, const6, const3);
4190     ST_D2(res1, 0, 1, dst, dst_stride);
4191     dst += 2 * dst_stride;
4192 
4193     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4194     res0 = __msa_aver_u_b(avg0, res0);
4195     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4196                                         horiz7, horiz8, horiz8, horiz7,
4197                                         horiz7, horiz6, horiz5, horiz4,
4198                                         horiz8, horiz8, horiz7, horiz6,
4199                                         const20, const6, const3);
4200     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4201     res1 = __msa_aver_u_b(avg1, res1);
4202     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4203 }
4204 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * Pass 1 (hv_mc_qpel_aver_horiz_src0_16x16_msa) writes its output into a
 * 272-byte stack buffer using a stride of 16; pass 2
 * (vert_mc_qpel_16x16_msa) reads that buffer with the same stride and
 * writes the final result to dst.
 *
 * NOTE(review): 272 == 17 * 16, so the buffer presumably holds 17 rows of
 * 16 bytes, and the trailing 16 passed to pass 1 is presumably a row
 * count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride,
                                         scratch, scratch_stride, 16);
    vert_mc_qpel_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
4215 
hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4216 static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src,
4217                                            int32_t src_stride,
4218                                            uint8_t *dst,
4219                                            int32_t dst_stride)
4220 {
4221     v16u8 inp0, inp1, inp2, inp3;
4222     v16u8 res0, res1;
4223     v16u8 horiz0, horiz1, horiz2, horiz3;
4224     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4225     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4226     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4227     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4228     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4229     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4230     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4231     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4232 
4233     LD_UB2(src, src_stride, inp0, inp1);
4234     src += (2 * src_stride);
4235     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4236                                          const20, const6, const3);
4237     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4238     horiz0 = __msa_aver_u_b(inp0, res0);
4239     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4240 
4241     LD_UB2(src, src_stride, inp2, inp3);
4242     src += (2 * src_stride);
4243     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4244                                          const20, const6, const3);
4245     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4246     horiz2 = __msa_aver_u_b(inp2, res1);
4247     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4248     LD_UB2(src, src_stride, inp0, inp1);
4249     src += (2 * src_stride);
4250     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4251                                          const20, const6, const3);
4252     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4253     horiz4 = __msa_aver_u_b(inp0, res0);
4254     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4255     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4256                                         horiz1, horiz2, horiz3, horiz4,
4257                                         horiz1, horiz0, horiz0, horiz1,
4258                                         horiz2, horiz3, horiz4, horiz5,
4259                                         const20, const6, const3);
4260     ST_D2(res0, 0, 1, dst, dst_stride);
4261     dst += (2 * dst_stride);
4262 
4263     LD_UB2(src, src_stride, inp2, inp3);
4264     src += (2 * src_stride);
4265     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4266                                          const20, const6, const3);
4267     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4268     horiz6 = __msa_aver_u_b(inp2, res1);
4269     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4270     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4271                                         horiz3, horiz4, horiz5, horiz6,
4272                                         horiz3, horiz2, horiz1, horiz0,
4273                                         horiz4, horiz5, horiz6, horiz7,
4274                                         const20, const6, const3);
4275     inp0 = LD_UB(src);
4276     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4277                                               const20, const6, const3);
4278     horiz8 = __msa_aver_u_b(inp0, res0);
4279     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4280                                         horiz5, horiz6, horiz7, horiz8,
4281                                         horiz5, horiz4, horiz3, horiz2,
4282                                         horiz6, horiz7, horiz8, horiz8,
4283                                         const20, const6, const3);
4284     ST_D2(res1, 0, 1, dst, dst_stride);
4285     dst += 2 * dst_stride;
4286 
4287     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4288                                         horiz7, horiz8, horiz8, horiz7,
4289                                         horiz7, horiz6, horiz5, horiz4,
4290                                         horiz8, horiz8, horiz7, horiz6,
4291                                         const20, const6, const3);
4292     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4293 }
4294 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * hv_mc_qpel_aver_horiz_16x16_msa fills a 272-byte stack buffer (stride
 * 16) from src; vert_mc_qpel_16x16_msa then consumes that buffer and
 * writes dst.
 *
 * NOTE(review): 272 == 17 * 16; the last argument of the first call is
 * presumably a row count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride,
                                    scratch, scratch_stride, 16);
    vert_mc_qpel_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
4305 
hv_mc_qpel_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4306 static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
4307                                uint8_t *dst, int32_t dst_stride)
4308 {
4309     v16u8 inp0, inp1, inp2, inp3;
4310     v16u8 res0, res1;
4311     v16u8 horiz0, horiz1, horiz2, horiz3;
4312     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4313     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4314     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4315     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4316     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4317     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4318     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4319     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4320 
4321     LD_UB2(src, src_stride, inp0, inp1);
4322     src += (2 * src_stride);
4323     horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4324                                            mask0, mask1, mask2, mask3,
4325                                            const20, const6, const3);
4326     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4327     LD_UB2(src, src_stride, inp2, inp3);
4328     src += (2 * src_stride);
4329     horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4330                                            mask0, mask1, mask2, mask3,
4331                                            const20, const6, const3);
4332     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4333     LD_UB2(src, src_stride, inp0, inp1);
4334     src += (2 * src_stride);
4335     horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4336                                            mask0, mask1, mask2, mask3,
4337                                            const20, const6, const3);
4338     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4339     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4340                                         horiz1, horiz2, horiz3, horiz4,
4341                                         horiz1, horiz0, horiz0, horiz1,
4342                                         horiz2, horiz3, horiz4, horiz5,
4343                                         const20, const6, const3);
4344     ST_D2(res0, 0, 1, dst, dst_stride);
4345     dst += (2 * dst_stride);
4346 
4347     LD_UB2(src, src_stride, inp2, inp3);
4348     src += (2 * src_stride);
4349     horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4350                                            mask0, mask1, mask2, mask3,
4351                                            const20, const6, const3);
4352     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4353     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4354                                         horiz3, horiz4, horiz5, horiz6,
4355                                         horiz3, horiz2, horiz1, horiz0,
4356                                         horiz4, horiz5, horiz6, horiz7,
4357                                         const20, const6, const3);
4358     inp0 = LD_UB(src);
4359     horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4360                                                 mask0, mask1, mask2, mask3,
4361                                                 const20, const6, const3);
4362     ST_D2(res1, 0, 1, dst, dst_stride);
4363     dst += 2 * dst_stride;
4364 
4365     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4366                                         horiz5, horiz6, horiz7, horiz8,
4367                                         horiz5, horiz4, horiz3, horiz2,
4368                                         horiz6, horiz7, horiz8, horiz8,
4369                                         const20, const6, const3);
4370     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4371                                         horiz7, horiz8, horiz8, horiz7,
4372                                         horiz7, horiz6, horiz5, horiz4,
4373                                         horiz8, horiz8, horiz7, horiz6,
4374                                         const20, const6, const3);
4375     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4376 }
4377 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * hv_mc_qpel_aver_horiz_src1_16x16_msa fills a 272-byte stack buffer
 * (stride 16) from src; vert_mc_qpel_16x16_msa then consumes that buffer
 * and writes dst.
 *
 * NOTE(review): 272 == 17 * 16; the last argument of the first call is
 * presumably a row count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride,
                                         scratch, scratch_stride, 16);
    vert_mc_qpel_16x16_msa(scratch, scratch_stride, dst, dst_stride);
}
4388 
hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4389 static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src,
4390                                            int32_t src_stride,
4391                                            uint8_t *dst,
4392                                            int32_t dst_stride)
4393 {
4394     v16u8 inp0, inp1, inp2, inp3;
4395     v16u8 res0, res1;
4396     v16u8 horiz0, horiz1, horiz2, horiz3;
4397     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4398     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4399     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4400     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4401     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4402     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4403     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4404     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4405 
4406     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4407     src += (4 * src_stride);
4408 
4409     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4410                                          const20, const6, const3);
4411     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4412                                          const20, const6, const3);
4413     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4414 
4415     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4416     horiz0 = __msa_aver_u_b(inp0, res0);
4417     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4418     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4419 
4420     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4421     horiz2 = __msa_aver_u_b(inp2, res1);
4422     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4423     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4424     src += (4 * src_stride);
4425     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4426                                          const20, const6, const3);
4427     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4428                                          const20, const6, const3);
4429     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
4430 
4431     inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4432     horiz4 = __msa_aver_u_b(inp0, res0);
4433     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4434     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
4435 
4436     inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4437     horiz6 = __msa_aver_u_b(inp2, res1);
4438     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4439     inp0 = LD_UB(src);
4440     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4441                                               const20, const6, const3);
4442     inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4443     horiz8 = __msa_aver_u_b(inp0, res0);
4444     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4445                                         horiz1, horiz2, horiz3, horiz4,
4446                                         horiz1, horiz0, horiz0, horiz1,
4447                                         horiz2, horiz3, horiz4, horiz5,
4448                                         const20, const6, const3);
4449     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4450                                         horiz3, horiz4, horiz5, horiz6,
4451                                         horiz3, horiz2, horiz1, horiz0,
4452                                         horiz4, horiz5, horiz6, horiz7,
4453                                         const20, const6, const3);
4454     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4455     dst += (4 * dst_stride);
4456 
4457     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4458                                         horiz5, horiz6, horiz7, horiz8,
4459                                         horiz5, horiz4, horiz3, horiz2,
4460                                         horiz6, horiz7, horiz8, horiz8,
4461                                         const20, const6, const3);
4462     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4463                                         horiz7, horiz8, horiz8, horiz7,
4464                                         horiz7, horiz6, horiz5, horiz4,
4465                                         horiz8, horiz8, horiz7, horiz6,
4466                                         const20, const6, const3);
4467     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4468 }
4469 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * hv_mc_qpel_aver_horiz_src0_16x16_msa fills a 272-byte stack buffer
 * (stride 16) from src; vert_mc_qpel_aver_src1_16x16_msa then consumes
 * that buffer and writes dst.
 *
 * NOTE(review): 272 == 17 * 16; the last argument of the first call is
 * presumably a row count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride,
                                         scratch, scratch_stride, 16);
    vert_mc_qpel_aver_src1_16x16_msa(scratch, scratch_stride,
                                     dst, dst_stride);
}
4480 
hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4481 static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src,
4482                                              int32_t src_stride,
4483                                              uint8_t *dst,
4484                                              int32_t dst_stride)
4485 {
4486     v16u8 inp0, inp1, inp2, inp3;
4487     v16u8 res0, res1, avg0, avg1;
4488     v16u8 horiz0, horiz1, horiz2, horiz3;
4489     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4490     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4491     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4492     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4493     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4494     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4495     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4496     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4497 
4498     LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4499     src += (4 * src_stride);
4500 
4501     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4502                                          const20, const6, const3);
4503     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4504                                          const20, const6, const3);
4505     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4506     horiz0 = __msa_aver_u_b(inp0, res0);
4507     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4508     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4509     horiz2 = __msa_aver_u_b(inp2, res1);
4510     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4511     LD_UB2(src, src_stride, inp0, inp1);
4512     src += (2 * src_stride);
4513 
4514     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4515                                          const20, const6, const3);
4516     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4517     horiz4 = __msa_aver_u_b(inp0, res0);
4518     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4519     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4520                                         horiz1, horiz2, horiz3, horiz4,
4521                                         horiz1, horiz0, horiz0, horiz1,
4522                                         horiz2, horiz3, horiz4, horiz5,
4523                                         const20, const6, const3);
4524     avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4525     res0 = __msa_aver_u_b(avg0, res0);
4526     ST_D2(res0, 0, 1, dst, dst_stride);
4527     dst += (2 * dst_stride);
4528 
4529     LD_UB2(src, src_stride, inp2, inp3);
4530     src += (2 * src_stride);
4531     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4532                                          const20, const6, const3);
4533     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4534     horiz6 = __msa_aver_u_b(inp2, res1);
4535     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4536     inp0 = LD_UB(src);
4537     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4538                                               const20, const6, const3);
4539     horiz8 = __msa_aver_u_b(inp0, res0);
4540     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4541                                         horiz3, horiz4, horiz5, horiz6,
4542                                         horiz3, horiz2, horiz1, horiz0,
4543                                         horiz4, horiz5, horiz6, horiz7,
4544                                         const20, const6, const3);
4545     avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4546     res1 = __msa_aver_u_b(avg1, res1);
4547     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4548                                         horiz5, horiz6, horiz7, horiz8,
4549                                         horiz5, horiz4, horiz3, horiz2,
4550                                         horiz6, horiz7, horiz8, horiz8,
4551                                         const20, const6, const3);
4552     ST_D2(res1, 0, 1, dst, dst_stride);
4553     dst += 2 * dst_stride;
4554 
4555     avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4556     res0 = __msa_aver_u_b(avg0, res0);
4557     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4558                                         horiz7, horiz8, horiz8, horiz7,
4559                                         horiz7, horiz6, horiz5, horiz4,
4560                                         horiz8, horiz8, horiz7, horiz6,
4561                                         const20, const6, const3);
4562     avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4563     res1 = __msa_aver_u_b(avg1, res1);
4564     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4565 }
4566 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * hv_mc_qpel_aver_horiz_16x16_msa fills a 272-byte stack buffer (stride
 * 16) from src; vert_mc_qpel_aver_src1_16x16_msa then consumes that
 * buffer and writes dst.
 *
 * NOTE(review): 272 == 17 * 16; the last argument of the first call is
 * presumably a row count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst,
                                             int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride,
                                    scratch, scratch_stride, 16);
    vert_mc_qpel_aver_src1_16x16_msa(scratch, scratch_stride,
                                     dst, dst_stride);
}
4577 
hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)4578 static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src,
4579                                            int32_t src_stride,
4580                                            uint8_t *dst,
4581                                            int32_t dst_stride)
4582 {
4583     v16u8 inp0, inp1, inp2, inp3;
4584     v16u8 res0, res1, avg0, avg1;
4585     v16u8 horiz0, horiz1, horiz2, horiz3;
4586     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4587     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4588     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4589     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4590     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
4591     v16u8 const20 = (v16u8) __msa_ldi_b(20);
4592     v16u8 const6 = (v16u8) __msa_ldi_b(6);
4593     v16u8 const3 = (v16u8) __msa_ldi_b(3);
4594 
4595     LD_UB2(src, src_stride, inp0, inp1);
4596     src += (2 * src_stride);
4597     horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4598                                            mask0, mask1, mask2, mask3,
4599                                            const20, const6, const3);
4600     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4601     LD_UB2(src, src_stride, inp2, inp3);
4602     src += (2 * src_stride);
4603     horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4604                                            mask0, mask1, mask2, mask3,
4605                                            const20, const6, const3);
4606     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4607     LD_UB2(src, src_stride, inp0, inp1);
4608     src += (2 * src_stride);
4609     horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4610                                            mask0, mask1, mask2, mask3,
4611                                            const20, const6, const3);
4612     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4613     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4614     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4615                                         horiz1, horiz2, horiz3, horiz4,
4616                                         horiz1, horiz0, horiz0, horiz1,
4617                                         horiz2, horiz3, horiz4, horiz5,
4618                                         const20, const6, const3);
4619     avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4620     res0 = __msa_aver_u_b(avg0, res0);
4621     ST_D2(res0, 0, 1, dst, dst_stride);
4622     dst += (2 * dst_stride);
4623 
4624     LD_UB2(src, src_stride, inp2, inp3);
4625     src += (2 * src_stride);
4626     horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4627                                            mask0, mask1, mask2, mask3,
4628                                            const20, const6, const3);
4629     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4630     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4631                                         horiz3, horiz4, horiz5, horiz6,
4632                                         horiz3, horiz2, horiz1, horiz0,
4633                                         horiz4, horiz5, horiz6, horiz7,
4634                                         const20, const6, const3);
4635     inp0 = LD_UB(src);
4636     horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4637                                                 mask0, mask1, mask2, mask3,
4638                                                 const20, const6, const3);
4639     avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4640     res1 = __msa_aver_u_b(avg1, res1);
4641     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4642                                         horiz5, horiz6, horiz7, horiz8,
4643                                         horiz5, horiz4, horiz3, horiz2,
4644                                         horiz6, horiz7, horiz8, horiz8,
4645                                         const20, const6, const3);
4646     ST_D2(res1, 0, 1, dst, dst_stride);
4647     dst += 2 * dst_stride;
4648     avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4649     res0 = __msa_aver_u_b(avg0, res0);
4650 
4651     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4652                                         horiz7, horiz8, horiz8, horiz7,
4653                                         horiz7, horiz6, horiz5, horiz4,
4654                                         horiz8, horiz8, horiz7, horiz6,
4655                                         const20, const6, const3);
4656     avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4657     res1 = __msa_aver_u_b(avg1, res1);
4658     ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
4659 }
4660 
/*
 * 16x16 two-pass qpel interpolation.
 *
 * hv_mc_qpel_aver_horiz_src1_16x16_msa fills a 272-byte stack buffer
 * (stride 16) from src; vert_mc_qpel_aver_src1_16x16_msa then consumes
 * that buffer and writes dst.
 *
 * NOTE(review): 272 == 17 * 16; the last argument of the first call is
 * presumably a row count -- confirm against the helper definitions.
 */
static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride)
{
    uint8_t scratch[272];
    const int32_t scratch_stride = 16;

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride,
                                         scratch, scratch_stride, 16);
    vert_mc_qpel_aver_src1_16x16_msa(scratch, scratch_stride,
                                     dst, dst_stride);
}
4671 
/*
 * 8x8 quarter-pel H+V interpolation, "aver_hv_src11" variant.
 *
 * Nine source rows are read. Each row is run through the horizontal filter
 * and averaged with the same source row slid by one byte, giving horiz0..8.
 * The vertical filter is then applied to those rows and each output row is
 * averaged with the intermediate row one below it (horiz1..8) before eight
 * 8-byte rows are stored to dst.
 *
 * Vectors carry two 8-byte rows, one per 64-bit half; the odd-numbered
 * horizN are the upper half of the preceding even-numbered one.
 *
 * src/src_stride: source rows; dst/dst_stride: destination rows.
 */
static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    /* Byte-shuffle patterns and tap weights handed to the filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0..3. */
    LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
    src += (4 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                         mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Slide each row by one byte so the average below uses the neighbouring
     * source pixel rather than the co-located one. */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low 64 bits of both slid rows into one vector. */
    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    /* Source rows 4..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0..1.
     * NOTE(review): the repeated horiz0/horiz1 arguments look like edge
     * handling for taps above the first row -- confirm against
     * APPLY_VERT_QPEL_FILTER_8BYTE. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    /* Average with intermediate rows 1 and 2 (one row below each output). */
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
    res0 = __msa_aver_u_b(avg0, res0);
    /* Source rows 6..7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2..3, averaged with intermediate rows 3 and 4. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
    res1 = __msa_aver_u_b(avg1, res1);
    /* Source row 8, the ninth and last row. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Output rows 4..5. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += 2 * dst_stride;

    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
    res0 = __msa_aver_u_b(avg0, res0);
    /* Output rows 6..7.
     * NOTE(review): the repeated horiz8/horiz7 arguments look like edge
     * handling for taps below the last row -- confirm against the macro. */
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
    res1 = __msa_aver_u_b(avg1, res1);
    /* Store output rows 4..7 in one go. */
    ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
4764 
/*
 * 16x16 H+V qpel, "avg_dst_aver_hv_src00" variant.
 *
 * Runs in two passes through a stack scratch buffer: the horizontal
 * "aver_horiz_src0" helper fills the scratch area with a 16-byte stride,
 * then the vertical "avg_dst_aver_src0" helper consumes it and produces dst.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    enum { SCRATCH_STRIDE = 16, SCRATCH_BYTES = 272 };
    /* 272 = 17 * 16; presumably 16 rows plus one extra row for the
     * vertical pass -- confirm against the horizontal helper. */
    uint8_t scratch[SCRATCH_BYTES];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride,
                                         scratch, SCRATCH_STRIDE, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(scratch, SCRATCH_STRIDE,
                                             dst, dst_stride);
}
4775 
/*
 * 8x8 quarter-pel H+V interpolation averaged into dst, "aver_hv_src00"
 * variant.
 *
 * Nine source rows are read. Each is horizontally filtered and averaged with
 * the unshifted source row (horiz0..8). The vertical filter output for each
 * row pair is averaged first with the intermediate rows of the same index
 * (horiz0/1, 2/3, ...) and then with the two rows already in dst, and the
 * result is written back to dst.
 *
 * Vectors carry two 8-byte rows, one per 64-bit half.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle patterns and tap weights handed to the filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0..1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Source rows 2..3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    /* Pack the low halves of rows 0 and 1, then average with the filter
     * output to get the intermediate rows. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Source rows 4..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0..1: vertical filter, average with intermediate rows
     * 0 and 1, then average with the current dst rows. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    /* NOTE(review): the repeated horiz0/horiz1 arguments look like edge
     * handling for taps above the first row -- confirm against
     * APPLY_VERT_QPEL_FILTER_8BYTE. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6..7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2..3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source row 8, the ninth and last row. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Output rows 4..5. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6..7.
     * NOTE(review): the repeated horiz8/horiz7 arguments look like edge
     * handling for taps below the last row -- confirm against the macro. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
4877 
/*
 * 16x16 H+V qpel, "avg_dst_aver_v_src0" variant.
 *
 * Runs in two passes through a stack scratch buffer: the plain horizontal
 * helper fills the scratch area with a 16-byte stride, then the vertical
 * "avg_dst_aver_src0" helper consumes it and produces dst.
 */
static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    enum { SCRATCH_STRIDE = 16, SCRATCH_BYTES = 272 };
    /* 272 = 17 * 16; presumably 16 rows plus one extra row for the
     * vertical pass -- confirm against the horizontal helper. */
    uint8_t scratch[SCRATCH_BYTES];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride,
                                    scratch, SCRATCH_STRIDE, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(scratch, SCRATCH_STRIDE,
                                             dst, dst_stride);
}
4888 
/*
 * 8x8 quarter-pel H+V interpolation averaged into dst, "aver_v_src0"
 * variant.
 *
 * Nine source rows are read and horizontally filtered; the filter output is
 * used directly as the intermediate rows horiz0..8 (no averaging with the
 * source). The vertical filter output for each row pair is averaged first
 * with the intermediate rows of the same index and then with the two rows
 * already in dst, and the result is written back to dst.
 *
 * Vectors carry two 8-byte rows, one per 64-bit half.
 */
static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle patterns and tap weights handed to the filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0..1 -> intermediate rows 0..1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    /* Source rows 2..3 -> intermediate rows 2..3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    /* Source rows 4..5 -> intermediate rows 4..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0..1: vertical filter, average with intermediate rows
     * 0 and 1, then average with the current dst rows. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    /* NOTE(review): the repeated horiz0/horiz1 arguments look like edge
     * handling for taps above the first row -- confirm against
     * APPLY_VERT_QPEL_FILTER_8BYTE. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6..7 -> intermediate rows 6..7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
                                           mask0, mask1, mask2, mask3,
                                           const20, const6, const3);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2..3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source row 8, the ninth and last row. */
    inp0 = LD_UB(src);
    horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
                                                mask0, mask1, mask2, mask3,
                                                const20, const6, const3);
    /* Output rows 4..5. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6..7.
     * NOTE(review): the repeated horiz8/horiz7 arguments look like edge
     * handling for taps below the last row -- confirm against the macro. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
4986 
/*
 * 16x16 H+V qpel, "avg_dst_aver_hv_src10" variant.
 *
 * Runs in two passes through a stack scratch buffer: the horizontal
 * "aver_horiz_src1" helper fills the scratch area with a 16-byte stride,
 * then the vertical "avg_dst_aver_src0" helper consumes it and produces dst.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    enum { SCRATCH_STRIDE = 16, SCRATCH_BYTES = 272 };
    /* 272 = 17 * 16; presumably 16 rows plus one extra row for the
     * vertical pass -- confirm against the horizontal helper. */
    uint8_t scratch[SCRATCH_BYTES];

    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride,
                                         scratch, SCRATCH_STRIDE, 16);
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(scratch, SCRATCH_STRIDE,
                                             dst, dst_stride);
}
4997 
/*
 * 8x8 quarter-pel H+V interpolation averaged into dst, "aver_hv_src10"
 * variant.
 *
 * Nine source rows are read. Each is horizontally filtered and averaged with
 * the same source row slid by one byte (horiz0..8). The vertical filter
 * output for each row pair is averaged first with the intermediate rows of
 * the same index (horiz0/1, 2/3, ...) and then with the two rows already in
 * dst, and the result is written back to dst.
 *
 * Vectors carry two 8-byte rows, one per 64-bit half.
 */
static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle patterns and tap weights handed to the filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0..1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    /* Source rows 2..3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    /* Slide each row by one byte so the average below uses the neighbouring
     * source pixel rather than the co-located one. */
    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    /* Pack the low halves of both slid rows into one vector. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Source rows 4..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);

    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0..1: vertical filter, average with intermediate rows
     * 0 and 1, then average with the current dst rows. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
    /* NOTE(review): the repeated horiz0/horiz1 arguments look like edge
     * handling for taps above the first row -- confirm against
     * APPLY_VERT_QPEL_FILTER_8BYTE. */
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6..7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);

    SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);

    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2..3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source row 8, the ninth and last row. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Output rows 4..5. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    res0 = __msa_aver_u_b(avg0, res0);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6..7.
     * NOTE(review): the repeated horiz8/horiz7 arguments look like edge
     * handling for taps below the last row -- confirm against the macro. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    res1 = __msa_aver_u_b(avg1, res1);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5111 
/*
 * 16x16 H+V qpel, "avg_dst_aver_h_src0" variant.
 *
 * Runs in two passes through a stack scratch buffer: the horizontal
 * "aver_horiz_src0" helper fills the scratch area with a 16-byte stride,
 * then the vertical "avg_dst" helper consumes it and produces dst.
 */
static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    enum { SCRATCH_STRIDE = 16, SCRATCH_BYTES = 272 };
    /* 272 = 17 * 16; presumably 16 rows plus one extra row for the
     * vertical pass -- confirm against the horizontal helper. */
    uint8_t scratch[SCRATCH_BYTES];

    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride,
                                         scratch, SCRATCH_STRIDE, 16);
    vert_mc_qpel_avg_dst_16x16_msa(scratch, SCRATCH_STRIDE, dst, dst_stride);
}
5122 
/*
 * 8x8 quarter-pel H+V interpolation averaged into dst, "aver_h_src0"
 * variant.
 *
 * Nine source rows are read. Each is horizontally filtered and averaged with
 * the unshifted source row (horiz0..8). The vertical filter output is not
 * averaged with any intermediate row; it is averaged only with the two rows
 * already in dst, and the result is written back to dst.
 *
 * Vectors carry two 8-byte rows, one per 64-bit half.
 */
static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    v16u8 inp0, inp1, inp2, inp3;
    v16u8 res0, res1, avg0, avg1;
    v16u8 horiz0, horiz1, horiz2, horiz3;
    v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
    v16u8 dst0, dst1;
    /* Byte-shuffle patterns and tap weights handed to the filter macros. */
    v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
    v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
    v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    v16u8 const20 = (v16u8) __msa_ldi_b(20);
    v16u8 const6 = (v16u8) __msa_ldi_b(6);
    v16u8 const3 = (v16u8) __msa_ldi_b(3);

    /* Source rows 0..1. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Source rows 2..3. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    /* Pack the low halves of rows 0 and 1, then average with the filter
     * output to get the intermediate rows. */
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz0 = __msa_aver_u_b(inp0, res0);
    horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    /* Source rows 4..5. */
    LD_UB2(src, src_stride, inp0, inp1);
    src += (2 * src_stride);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz2 = __msa_aver_u_b(inp2, res1);
    horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
    horiz4 = __msa_aver_u_b(inp0, res0);
    horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
    /* Output rows 0..1: vertical filter, then average with the current
     * dst rows (avg0 holds the packed dst rows here).
     * NOTE(review): the repeated horiz0/horiz1 arguments look like edge
     * handling for taps above the first row -- confirm against
     * APPLY_VERT_QPEL_FILTER_8BYTE. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
                                        horiz1, horiz2, horiz3, horiz4,
                                        horiz1, horiz0, horiz0, horiz1,
                                        horiz2, horiz3, horiz4, horiz5,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source rows 6..7. */
    LD_UB2(src, src_stride, inp2, inp3);
    src += (2 * src_stride);
    res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
                                         const20, const6, const3);
    inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
    horiz6 = __msa_aver_u_b(inp2, res1);
    horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
    /* Output rows 2..3. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
                                        horiz3, horiz4, horiz5, horiz6,
                                        horiz3, horiz2, horiz1, horiz0,
                                        horiz4, horiz5, horiz6, horiz7,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Source row 8, the ninth and last row. */
    inp0 = LD_UB(src);
    res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
                                              const20, const6, const3);
    horiz8 = __msa_aver_u_b(inp0, res0);
    /* Output rows 4..5. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
                                        horiz5, horiz6, horiz7, horiz8,
                                        horiz5, horiz4, horiz3, horiz2,
                                        horiz6, horiz7, horiz8, horiz8,
                                        const20, const6, const3);
    avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res0 = __msa_aver_u_b(avg0, res0);
    ST_D2(res0, 0, 1, dst, dst_stride);
    dst += (2 * dst_stride);

    /* Output rows 6..7.
     * NOTE(review): the repeated horiz8/horiz7 arguments look like edge
     * handling for taps below the last row -- confirm against the macro. */
    LD_UB2(dst, dst_stride, dst0, dst1);
    res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
                                        horiz7, horiz8, horiz8, horiz7,
                                        horiz7, horiz6, horiz5, horiz4,
                                        horiz8, horiz8, horiz7, horiz6,
                                        const20, const6, const3);
    avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
    res1 = __msa_aver_u_b(avg1, res1);
    ST_D2(res1, 0, 1, dst, dst_stride);
}
5216 
/*
 * 16x16 H+V qpel, "avg_dst" variant.
 *
 * Runs in two passes through a stack scratch buffer: the plain horizontal
 * helper fills the scratch area with a 16-byte stride, then the vertical
 * "avg_dst" helper consumes it and produces dst.
 */
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    enum { SCRATCH_STRIDE = 16, SCRATCH_BYTES = 272 };
    /* 272 = 17 * 16; presumably 16 rows plus one extra row for the
     * vertical pass -- confirm against the horizontal helper. */
    uint8_t scratch[SCRATCH_BYTES];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride,
                                    scratch, SCRATCH_STRIDE, 16);
    vert_mc_qpel_avg_dst_16x16_msa(scratch, SCRATCH_STRIDE, dst, dst_stride);
}
5226 
hv_mc_qpel_avg_dst_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5227 static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
5228                                        uint8_t *dst, int32_t dst_stride)
5229 {
5230     v16u8 inp0, inp1, inp2, inp3;
5231     v16u8 res0, res1, avg0, avg1;
5232     v16u8 horiz0, horiz1, horiz2, horiz3;
5233     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5234     v16u8 dst0, dst1;
5235     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5236     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5237     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5238     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5239     v16u8 const20 = (v16u8) __msa_ldi_b(20);
5240     v16u8 const6 = (v16u8) __msa_ldi_b(6);
5241     v16u8 const3 = (v16u8) __msa_ldi_b(3);
5242 
5243     LD_UB2(src, src_stride, inp0, inp1);
5244     src += (2 * src_stride);
5245     horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5246                                            mask0, mask1, mask2, mask3,
5247                                            const20, const6, const3);
5248     LD_UB2(src, src_stride, inp2, inp3);
5249     src += (2 * src_stride);
5250     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5251     horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5252                                            mask0, mask1, mask2, mask3,
5253                                            const20, const6, const3);
5254     LD_UB2(src, src_stride, inp0, inp1);
5255     src += (2 * src_stride);
5256     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5257     horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5258                                            mask0, mask1, mask2, mask3,
5259                                            const20, const6, const3);
5260     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5261     LD_UB2(src, src_stride, inp2, inp3);
5262     src += (2 * src_stride);
5263     horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5264                                            mask0, mask1, mask2, mask3,
5265                                            const20, const6, const3);
5266     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5267     inp0 = LD_UB(src);
5268     horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5269                                                 mask0, mask1, mask2, mask3,
5270                                                 const20, const6, const3);
5271     LD_UB2(dst, dst_stride, dst0, dst1);
5272     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5273                                         horiz1, horiz2, horiz3, horiz4,
5274                                         horiz1, horiz0, horiz0, horiz1,
5275                                         horiz2, horiz3, horiz4, horiz5,
5276                                         const20, const6, const3);
5277     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5278     res0 = __msa_aver_u_b(avg0, res0);
5279     ST_D2(res0, 0, 1, dst, dst_stride);
5280     dst += (2 * dst_stride);
5281 
5282     LD_UB2(dst, dst_stride, dst0, dst1);
5283     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5284                                         horiz3, horiz4, horiz5, horiz6,
5285                                         horiz3, horiz2, horiz1, horiz0,
5286                                         horiz4, horiz5, horiz6, horiz7,
5287                                         const20, const6, const3);
5288     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5289     res1 = __msa_aver_u_b(avg1, res1);
5290     ST_D2(res1, 0, 1, dst, dst_stride);
5291     dst += (2 * dst_stride);
5292 
5293     LD_UB2(dst, dst_stride, dst0, dst1);
5294     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5295                                         horiz5, horiz6, horiz7, horiz8,
5296                                         horiz5, horiz4, horiz3, horiz2,
5297                                         horiz6, horiz7, horiz8, horiz8,
5298                                         const20, const6, const3);
5299     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5300     res0 = __msa_aver_u_b(avg0, res0);
5301     ST_D2(res0, 0, 1, dst, dst_stride);
5302     dst += (2 * dst_stride);
5303 
5304     LD_UB2(dst, dst_stride, dst0, dst1);
5305     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5306                                         horiz7, horiz8, horiz8, horiz7,
5307                                         horiz7, horiz6, horiz5, horiz4,
5308                                         horiz8, horiz8, horiz7, horiz6,
5309                                         const20, const6, const3);
5310     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5311     res1 = __msa_aver_u_b(avg1, res1);
5312     ST_D2(res1, 0, 1, dst, dst_stride);
5313 }
5314 
/**
 * 16x16 two-pass variant: "src1" horizontal helper followed by the
 * avg-dst vertical helper.
 *
 * The horizontal helper is handed a 272-byte (17 * 16) stack buffer with a
 * row stride of 16; the vertical helper then takes that buffer as its source
 * and produces dst.  The filtering itself is done by the two helpers, which
 * are defined elsewhere in this file.
 *
 * @param src        source pixels, forwarded to the horizontal helper
 * @param src_stride source row stride, forwarded to the horizontal helper
 * @param dst        destination, forwarded to the vertical helper
 * @param dst_stride destination row stride, forwarded to the vertical helper
 */
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* intermediate rows between the two passes, 16 bytes apart */
    uint8_t horiz_out[17 * 16];

    /* NOTE(review): the trailing 16 is presumably the row count - confirm
     * against the helper's definition */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5325 
hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5326 static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src,
5327                                                    int32_t src_stride,
5328                                                    uint8_t *dst,
5329                                                    int32_t dst_stride)
5330 {
5331     v16u8 inp0, inp1, inp2, inp3;
5332     v16u8 res0, res1, avg0, avg1;
5333     v16u8 horiz0, horiz1, horiz2, horiz3;
5334     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5335     v16u8 dst0, dst1;
5336     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5337     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5338     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5339     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5340     v16u8 const20 = (v16u8) __msa_ldi_b(20);
5341     v16u8 const6 = (v16u8) __msa_ldi_b(6);
5342     v16u8 const3 = (v16u8) __msa_ldi_b(3);
5343 
5344     LD_UB2(src, src_stride, inp0, inp1);
5345     src += (2 * src_stride);
5346     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5347                                          const20, const6, const3);
5348     LD_UB2(src, src_stride, inp2, inp3);
5349     src += (2 * src_stride);
5350     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5351 
5352     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5353     horiz0 = __msa_aver_u_b(inp0, res0);
5354     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5355     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5356                                          const20, const6, const3);
5357     LD_UB2(src, src_stride, inp0, inp1);
5358     src += (2 * src_stride);
5359     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5360 
5361     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5362     horiz2 = __msa_aver_u_b(inp2, res1);
5363     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5364     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5365                                          const20, const6, const3);
5366 
5367     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5368 
5369     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5370     horiz4 = __msa_aver_u_b(inp0, res0);
5371     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5372     LD_UB2(dst, dst_stride, dst0, dst1);
5373     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5374                                         horiz1, horiz2, horiz3, horiz4,
5375                                         horiz1, horiz0, horiz0, horiz1,
5376                                         horiz2, horiz3, horiz4, horiz5,
5377                                         const20, const6, const3);
5378     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5379     res0 = __msa_aver_u_b(avg0, res0);
5380     ST_D2(res0, 0, 1, dst, dst_stride);
5381     dst += (2 * dst_stride);
5382 
5383     LD_UB2(src, src_stride, inp2, inp3);
5384     src += (2 * src_stride);
5385     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5386                                          const20, const6, const3);
5387 
5388     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5389 
5390     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5391     horiz6 = __msa_aver_u_b(inp2, res1);
5392     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5393     LD_UB2(dst, dst_stride, dst0, dst1);
5394     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5395                                         horiz3, horiz4, horiz5, horiz6,
5396                                         horiz3, horiz2, horiz1, horiz0,
5397                                         horiz4, horiz5, horiz6, horiz7,
5398                                         const20, const6, const3);
5399     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5400     res1 = __msa_aver_u_b(avg1, res1);
5401     ST_D2(res1, 0, 1, dst, dst_stride);
5402     dst += (2 * dst_stride);
5403 
5404     inp0 = LD_UB(src);
5405     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5406                                               const20, const6, const3);
5407     inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5408     horiz8 = __msa_aver_u_b(inp0, res0);
5409     LD_UB2(dst, dst_stride, dst0, dst1);
5410     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5411                                         horiz5, horiz6, horiz7, horiz8,
5412                                         horiz5, horiz4, horiz3, horiz2,
5413                                         horiz6, horiz7, horiz8, horiz8,
5414                                         const20, const6, const3);
5415     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5416     res0 = __msa_aver_u_b(avg0, res0);
5417     ST_D2(res0, 0, 1, dst, dst_stride);
5418     dst += (2 * dst_stride);
5419 
5420     LD_UB2(dst, dst_stride, dst0, dst1);
5421     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5422                                         horiz7, horiz8, horiz8, horiz7,
5423                                         horiz7, horiz6, horiz5, horiz4,
5424                                         horiz8, horiz8, horiz7, horiz6,
5425                                         const20, const6, const3);
5426     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427     res1 = __msa_aver_u_b(avg1, res1);
5428     ST_D2(res1, 0, 1, dst, dst_stride);
5429 }
5430 
/**
 * 16x16 two-pass variant: "src0" horizontal helper followed by the
 * avg-dst "src1" vertical helper.
 *
 * The horizontal helper is handed a 272-byte (17 * 16) stack buffer with a
 * row stride of 16; the vertical helper then takes that buffer as its source
 * and produces dst.  The filtering itself is done by the two helpers, which
 * are defined elsewhere in this file.
 *
 * @param src        source pixels, forwarded to the horizontal helper
 * @param src_stride source row stride, forwarded to the horizontal helper
 * @param dst        destination, forwarded to the vertical helper
 * @param dst_stride destination row stride, forwarded to the vertical helper
 */
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* intermediate rows between the two passes, 16 bytes apart */
    uint8_t horiz_out[17 * 16];

    /* NOTE(review): the trailing 16 is presumably the row count - confirm
     * against the helper's definition */
    hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5441 
hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5442 static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src,
5443                                                      int32_t src_stride,
5444                                                      uint8_t *dst,
5445                                                      int32_t dst_stride)
5446 {
5447     v16u8 inp0, inp1, inp2, inp3;
5448     v16u8 res0, res1, avg0, avg1;
5449     v16u8 horiz0, horiz1, horiz2, horiz3;
5450     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5451     v16u8 dst0, dst1;
5452     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5453     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5454     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5455     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5456     v16u8 const20 = (v16u8) __msa_ldi_b(20);
5457     v16u8 const6 = (v16u8) __msa_ldi_b(6);
5458     v16u8 const3 = (v16u8) __msa_ldi_b(3);
5459 
5460     LD_UB2(src, src_stride, inp0, inp1);
5461     src += (2 * src_stride);
5462 
5463     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5464                                          const20, const6, const3);
5465     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5466     horiz0 = __msa_aver_u_b(inp0, res0);
5467     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5468     LD_UB2(src, src_stride, inp2, inp3);
5469     src += (2 * src_stride);
5470     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5471                                          const20, const6, const3);
5472     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5473     horiz2 = __msa_aver_u_b(inp2, res1);
5474     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5475     LD_UB2(dst, dst_stride, dst0, dst1);
5476     LD_UB2(src, src_stride, inp0, inp1);
5477     src += (2 * src_stride);
5478     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5479                                          const20, const6, const3);
5480     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5481     horiz4 = __msa_aver_u_b(inp0, res0);
5482     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5483     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5484                                         horiz1, horiz2, horiz3, horiz4,
5485                                         horiz1, horiz0, horiz0, horiz1,
5486                                         horiz2, horiz3, horiz4, horiz5,
5487                                         const20, const6, const3);
5488     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5489     res0 = __msa_aver_u_b(avg0, res0);
5490     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5491     res0 = __msa_aver_u_b(avg0, res0);
5492     ST_D2(res0, 0, 1, dst, dst_stride);
5493     dst += (2 * dst_stride);
5494 
5495     LD_UB2(dst, dst_stride, dst0, dst1);
5496     LD_UB2(src, src_stride, inp2, inp3);
5497     src += (2 * src_stride);
5498     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5499                                          const20, const6, const3);
5500     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5501     horiz6 = __msa_aver_u_b(inp2, res1);
5502     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5503     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5504                                         horiz3, horiz4, horiz5, horiz6,
5505                                         horiz3, horiz2, horiz1, horiz0,
5506                                         horiz4, horiz5, horiz6, horiz7,
5507                                         const20, const6, const3);
5508     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5509     res1 = __msa_aver_u_b(avg1, res1);
5510     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5511     res1 = __msa_aver_u_b(avg1, res1);
5512     ST_D2(res1, 0, 1, dst, dst_stride);
5513     dst += (2 * dst_stride);
5514 
5515     inp0 = LD_UB(src);
5516     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5517                                               const20, const6, const3);
5518     horiz8 = __msa_aver_u_b(inp0, res0);
5519     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5520                                         horiz5, horiz6, horiz7, horiz8,
5521                                         horiz5, horiz4, horiz3, horiz2,
5522                                         horiz6, horiz7, horiz8, horiz8,
5523                                         const20, const6, const3);
5524     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5525                                         horiz7, horiz8, horiz8, horiz7,
5526                                         horiz7, horiz6, horiz5, horiz4,
5527                                         horiz8, horiz8, horiz7, horiz6,
5528                                         const20, const6, const3);
5529     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5530     res0 = __msa_aver_u_b(avg0, res0);
5531     LD_UB2(dst, dst_stride, dst0, dst1);
5532     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5533     res0 = __msa_aver_u_b(avg0, res0);
5534     ST_D2(res0, 0, 1, dst, dst_stride);
5535     dst += (2 * dst_stride);
5536 
5537     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5538     res1 = __msa_aver_u_b(avg1, res1);
5539     LD_UB2(dst, dst_stride, dst0, dst1);
5540     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5541     res1 = __msa_aver_u_b(avg1, res1);
5542     ST_D2(res1, 0, 1, dst, dst_stride);
5543 }
5544 
/**
 * 16x16 two-pass variant: plain horizontal helper followed by the
 * avg-dst "src1" vertical helper.
 *
 * The horizontal helper is handed a 272-byte (17 * 16) stack buffer with a
 * row stride of 16; the vertical helper then takes that buffer as its source
 * and produces dst.  The filtering itself is done by the two helpers, which
 * are defined elsewhere in this file.
 *
 * @param src        source pixels, forwarded to the horizontal helper
 * @param src_stride source row stride, forwarded to the horizontal helper
 * @param dst        destination, forwarded to the vertical helper
 * @param dst_stride destination row stride, forwarded to the vertical helper
 */
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *dst,
                                                     int32_t dst_stride)
{
    /* intermediate rows between the two passes, 16 bytes apart */
    uint8_t horiz_out[17 * 16];

    /* NOTE(review): the trailing 16 is presumably the row count - confirm
     * against the helper's definition */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5555 
hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5556 static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src,
5557                                                    int32_t src_stride,
5558                                                    uint8_t *dst,
5559                                                    int32_t dst_stride)
5560 {
5561     v16u8 inp0, inp1, inp2, inp3;
5562     v16u8 res0, res1, avg0, avg1;
5563     v16u8 horiz0, horiz1, horiz2, horiz3;
5564     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5565     v16u8 dst0, dst1;
5566     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5567     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5568     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5569     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5570     v16u8 const20 = (v16u8) __msa_ldi_b(20);
5571     v16u8 const6 = (v16u8) __msa_ldi_b(6);
5572     v16u8 const3 = (v16u8) __msa_ldi_b(3);
5573 
5574     LD_UB2(src, src_stride, inp0, inp1);
5575     src += (2 * src_stride);
5576     horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5577                                            mask0, mask1, mask2, mask3,
5578                                            const20, const6, const3);
5579     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5580     LD_UB2(src, src_stride, inp2, inp3);
5581     src += (2 * src_stride);
5582     horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5583                                            mask0, mask1, mask2, mask3,
5584                                            const20, const6, const3);
5585     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5586     LD_UB2(dst, dst_stride, dst0, dst1);
5587     LD_UB2(src, src_stride, inp0, inp1);
5588     src += (2 * src_stride);
5589     horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5590                                            mask0, mask1, mask2, mask3,
5591                                            const20, const6, const3);
5592     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5593     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5594                                         horiz1, horiz2, horiz3, horiz4,
5595                                         horiz1, horiz0, horiz0, horiz1,
5596                                         horiz2, horiz3, horiz4, horiz5,
5597                                         const20, const6, const3);
5598     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5599     res0 = __msa_aver_u_b(avg0, res0);
5600     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5601     res0 = __msa_aver_u_b(avg0, res0);
5602     ST_D2(res0, 0, 1, dst, dst_stride);
5603     dst += (2 * dst_stride);
5604 
5605     LD_UB2(dst, dst_stride, dst0, dst1);
5606     LD_UB2(src, src_stride, inp2, inp3);
5607     src += (2 * src_stride);
5608     horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5609                                            mask0, mask1, mask2, mask3,
5610                                            const20, const6, const3);
5611     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5612     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5613                                         horiz3, horiz4, horiz5, horiz6,
5614                                         horiz3, horiz2, horiz1, horiz0,
5615                                         horiz4, horiz5, horiz6, horiz7,
5616                                         const20, const6, const3);
5617     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5618     res1 = __msa_aver_u_b(avg1, res1);
5619     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5620     res1 = __msa_aver_u_b(avg1, res1);
5621     ST_D2(res1, 0, 1, dst, dst_stride);
5622     dst += (2 * dst_stride);
5623 
5624     inp0 = LD_UB(src);
5625     horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5626                                                 mask0, mask1, mask2, mask3,
5627                                                 const20, const6, const3);
5628     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5629                                         horiz6, horiz7, horiz8, horiz5, horiz4,
5630                                         horiz3, horiz2, horiz6, horiz7, horiz8,
5631                                         horiz8, const20, const6, const3);
5632     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5633                                         horiz8, horiz8, horiz7, horiz7, horiz6,
5634                                         horiz5, horiz4, horiz8, horiz8, horiz7,
5635                                         horiz6, const20, const6, const3);
5636     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5637     res0 = __msa_aver_u_b(avg0, res0);
5638     LD_UB2(dst, dst_stride, dst0, dst1);
5639     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5640     res0 = __msa_aver_u_b(avg0, res0);
5641     ST_D2(res0, 0, 1, dst, dst_stride);
5642     dst += (2 * dst_stride);
5643 
5644     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5645     res1 = __msa_aver_u_b(avg1, res1);
5646     LD_UB2(dst, dst_stride, dst0, dst1);
5647     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5648     res1 = __msa_aver_u_b(avg1, res1);
5649     ST_D2(res1, 0, 1, dst, dst_stride);
5650 }
5651 
/**
 * 16x16 two-pass variant: "src1" horizontal helper followed by the
 * avg-dst "src1" vertical helper.
 *
 * The horizontal helper is handed a 272-byte (17 * 16) stack buffer with a
 * row stride of 16; the vertical helper then takes that buffer as its source
 * and produces dst.  The filtering itself is done by the two helpers, which
 * are defined elsewhere in this file.
 *
 * @param src        source pixels, forwarded to the horizontal helper
 * @param src_stride source row stride, forwarded to the horizontal helper
 * @param dst        destination, forwarded to the vertical helper
 * @param dst_stride destination row stride, forwarded to the vertical helper
 */
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *dst,
                                                       int32_t dst_stride)
{
    /* intermediate rows between the two passes, 16 bytes apart */
    uint8_t horiz_out[17 * 16];

    /* NOTE(review): the trailing 16 is presumably the row count - confirm
     * against the helper's definition */
    hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, horiz_out, 16, 16);
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(horiz_out, 16, dst, dst_stride);
}
5662 
hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5663 static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src,
5664                                                      int32_t src_stride,
5665                                                      uint8_t *dst,
5666                                                      int32_t dst_stride)
5667 {
5668     v16u8 inp0, inp1, inp2, inp3;
5669     v16u8 res0, res1, avg0, avg1;
5670     v16u8 horiz0, horiz1, horiz2, horiz3;
5671     v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5672     v16u8 dst0, dst1;
5673     v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5674     v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5675     v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5676     v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5677     v16u8 const20 = (v16u8) __msa_ldi_b(20);
5678     v16u8 const6 = (v16u8) __msa_ldi_b(6);
5679     v16u8 const3 = (v16u8) __msa_ldi_b(3);
5680 
5681     LD_UB2(src, src_stride, inp0, inp1);
5682     src += (2 * src_stride);
5683     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5684                                          const20, const6, const3);
5685     LD_UB2(src, src_stride, inp2, inp3);
5686     src += (2 * src_stride);
5687     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5688 
5689     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5690     horiz0 = __msa_aver_u_b(inp0, res0);
5691     horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5692     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5693                                          const20, const6, const3);
5694     LD_UB2(src, src_stride, inp0, inp1);
5695     src += (2 * src_stride);
5696     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5697 
5698     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5699     horiz2 = __msa_aver_u_b(inp2, res1);
5700     horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5701     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5702                                          const20, const6, const3);
5703     SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
5704 
5705     inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5706     horiz4 = __msa_aver_u_b(inp0, res0);
5707     horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5708     LD_UB2(dst, dst_stride, dst0, dst1);
5709     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5710     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
5711                                         horiz2, horiz3, horiz4, horiz1, horiz0,
5712                                         horiz0, horiz1, horiz2, horiz3, horiz4,
5713                                         horiz5, const20, const6, const3);
5714     res0 = __msa_aver_u_b(avg0, res0);
5715     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5716     res0 = __msa_aver_u_b(avg0, res0);
5717     ST_D2(res0, 0, 1, dst, dst_stride);
5718     dst += (2 * dst_stride);
5719 
5720     LD_UB2(src, src_stride, inp2, inp3);
5721     src += (2 * src_stride);
5722     res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5723                                          const20, const6, const3);
5724     SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
5725 
5726     inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5727     horiz6 = __msa_aver_u_b(inp2, res1);
5728     horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5729     LD_UB2(dst, dst_stride, dst0, dst1);
5730     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5731     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
5732                                         horiz4, horiz5, horiz6, horiz3, horiz2,
5733                                         horiz1, horiz0, horiz4, horiz5, horiz6,
5734                                         horiz7, const20, const6, const3);
5735     res1 = __msa_aver_u_b(avg1, res1);
5736     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5737     res1 = __msa_aver_u_b(avg1, res1);
5738     ST_D2(res1, 0, 1, dst, dst_stride);
5739     dst += (2 * dst_stride);
5740 
5741     inp0 = LD_UB(src);
5742     res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5743                                               const20, const6, const3);
5744     inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5745     horiz8 = __msa_aver_u_b(inp0, res0);
5746     LD_UB2(dst, dst_stride, dst0, dst1);
5747     avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5748     res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5749                                         horiz6, horiz7, horiz8, horiz5, horiz4,
5750                                         horiz3, horiz2, horiz6, horiz7, horiz8,
5751                                         horiz8, const20, const6, const3);
5752     res0 = __msa_aver_u_b(avg0, res0);
5753     avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5754     res0 = __msa_aver_u_b(avg0, res0);
5755     ST_D2(res0, 0, 1, dst, dst_stride);
5756     dst += (2 * dst_stride);
5757 
5758     LD_UB2(dst, dst_stride, dst0, dst1);
5759     avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5760     res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5761                                         horiz8, horiz8, horiz7, horiz7, horiz6,
5762                                         horiz5, horiz4, horiz8, horiz8, horiz7,
5763                                         horiz6, const20, const6, const3);
5764     res1 = __msa_aver_u_b(avg1, res1);
5765     avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5766     res1 = __msa_aver_u_b(avg1, res1);
5767     ST_D2(res1, 0, 1, dst, dst_stride);
5768 }
5769 
/**
 * Copy an 8x8 byte block.
 *
 * Eight rows are transferred as 64-bit words via the LD()/SD() macros, two
 * rows per loop iteration; both rows of a pair are read before either is
 * written.
 *
 * @param src        first source row
 * @param src_stride byte distance between source rows
 * @param dst        first destination row
 * @param dst_stride byte distance between destination rows
 */
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride)
{
    int32_t pair;

    for (pair = 0; pair < 4; pair++) {
        const uint8_t *src_next = src + src_stride;
        uint8_t *dst_next = dst + dst_stride;
        uint64_t row_even, row_odd;

        row_even = LD(src);
        row_odd = LD(src_next);

        SD(row_even, dst);
        SD(row_odd, dst_next);

        /* step past the two rows just handled */
        src = src_next + src_stride;
        dst = dst_next + dst_stride;
    }
}
5788 
copy_16x16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride)5789 static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
5790                            uint8_t *dst, int32_t dst_stride)
5791 {
5792     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5793     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
5794 
5795     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5796     src += (8 * src_stride);
5797     LD_UB8(src, src_stride,
5798            src8, src9, src10, src11, src12, src13, src14, src15);
5799 
5800     ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5801     dst += (8 * dst_stride);
5802     ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
5803            dst, dst_stride);
5804 }
5805 
avg_width8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)5806 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
5807                            uint8_t *dst, int32_t dst_stride,
5808                            int32_t height)
5809 {
5810     int32_t cnt;
5811     uint64_t out0, out1, out2, out3;
5812     v16u8 src0, src1, src2, src3;
5813     v16u8 dst0, dst1, dst2, dst3;
5814 
5815     for (cnt = (height / 4); cnt--;) {
5816         LD_UB4(src, src_stride, src0, src1, src2, src3);
5817         src += (4 * src_stride);
5818         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
5819 
5820         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5821                     dst0, dst1, dst2, dst3);
5822 
5823         out0 = __msa_copy_u_d((v2i64) dst0, 0);
5824         out1 = __msa_copy_u_d((v2i64) dst1, 0);
5825         out2 = __msa_copy_u_d((v2i64) dst2, 0);
5826         out3 = __msa_copy_u_d((v2i64) dst3, 0);
5827         SD4(out0, out1, out2, out3, dst, dst_stride);
5828         dst += (4 * dst_stride);
5829     }
5830 }
5831 
avg_width16_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height)5832 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
5833                             uint8_t *dst, int32_t dst_stride,
5834                             int32_t height)
5835 {
5836     int32_t cnt;
5837     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5838     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5839 
5840     for (cnt = (height / 8); cnt--;) {
5841         LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5842         src += (8 * src_stride);
5843         LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5844 
5845         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5846                     dst0, dst1, dst2, dst3);
5847         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5848                     dst4, dst5, dst6, dst7);
5849         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5850         dst += (8 * dst_stride);
5851     }
5852 }
5853 
/**
 * Exported 16x16 block copy.
 *
 * Thin adapter around copy_16x16_msa(): swaps the argument order to
 * (src, dst) and uses the one stride for both source and destination.
 *
 * @param dest   destination block
 * @param src    source block
 * @param stride row stride shared by src and dest
 */
void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_16x16_msa(src, stride,
                   dest, stride);
}
5858 
/*
 * Exported wrapper for copy_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    copy_8x8_msa(src, stride, dest, stride);
}
5863 
/*
 * Exported wrapper for horiz_mc_qpel_aver_src0_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_aver_src0_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5870 
/*
 * Exported wrapper for horiz_mc_qpel_aver_src0_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_aver_src0_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5877 
/*
 * Exported wrapper for horiz_mc_qpel_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_8width_msa(uint8_t *dest, const uint8_t *src,
                                 ptrdiff_t stride)
{
    horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
}
5883 
/*
 * Exported wrapper for horiz_mc_qpel_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_16width_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
}
5889 
/*
 * Exported wrapper for horiz_mc_qpel_aver_src1_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_aver_src1_8width_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5896 
/*
 * Exported wrapper for horiz_mc_qpel_aver_src1_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_aver_src1_16width_msa(uint8_t *dest,
                                            const uint8_t *src,
                                            ptrdiff_t stride)
{
    horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5903 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_aver_src0_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_aver_src0_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5910 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_aver_src0_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_aver_src0_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5917 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_8width_msa(uint8_t *dest,
                                        const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
}
5923 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_16width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
}
5929 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_aver_src1_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_aver_src1_8width_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5936 
/*
 * Exported wrapper for horiz_mc_qpel_no_rnd_aver_src1_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_no_rnd_aver_src1_16width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5943 
/*
 * Exported wrapper for avg_width8_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- confirm against the helper's signature.
 */
void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width8_msa(src, stride, dest, stride, 8);
}
5948 
/*
 * Exported wrapper for avg_width16_msa().
 * The single `stride` is supplied as both src_stride and dst_stride of the
 * helper, and 16 is its height argument, which the helper consumes in
 * height / 8 passes of eight rows each.
 */
void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
{
    avg_width16_msa(src, stride, dest, stride, 16);
}
5953 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_aver_src0_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_aver_src0_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
}
5960 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_aver_src0_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_aver_src0_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
}
5967 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_8width_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
}
5973 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_16width_msa(uint8_t *dest,
                                          const uint8_t *src, ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
}
5979 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_aver_src1_8width_msa().
 * Calls it as (src, stride, dest, stride, 8): the single `stride` is passed
 * twice, presumably as source and destination stride, and 8 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_aver_src1_8width_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
}
5986 
/*
 * Exported wrapper for horiz_mc_qpel_avg_dst_aver_src1_16width_msa().
 * Calls it as (src, stride, dest, stride, 16): the single `stride` is passed
 * twice, presumably as source and destination stride, and 16 is presumably
 * the number of rows -- the helper is defined outside this view.
 */
void ff_horiz_mc_qpel_avg_dst_aver_src1_16width_msa(uint8_t *dest,
                                                    const uint8_t *src,
                                                    ptrdiff_t stride)
{
    horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
}
5993 
5994 
/*
 * Exported wrapper for vert_mc_qpel_aver_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_aver_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
}
6000 
/*
 * Exported wrapper for vert_mc_qpel_aver_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_aver_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
}
6006 
/*
 * Exported wrapper for vert_mc_qpel_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    vert_mc_qpel_8x8_msa(src, stride, dest, stride);
}
6012 
/*
 * Exported wrapper for vert_mc_qpel_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                               ptrdiff_t stride)
{
    vert_mc_qpel_16x16_msa(src, stride, dest, stride);
}
6018 
/*
 * Exported wrapper for vert_mc_qpel_aver_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_aver_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
}
6024 
/*
 * Exported wrapper for vert_mc_qpel_aver_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_aver_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
}
6030 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_aver_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_aver_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
}
6037 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_aver_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_aver_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
}
6044 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}
6050 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                      const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}
6056 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_aver_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_aver_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
}
6063 
/*
 * Exported wrapper for vert_mc_qpel_no_rnd_aver_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_no_rnd_aver_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
}
6070 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_aver_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_aver_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
}
6077 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_aver_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_aver_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
}
6084 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}
6090 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}
6096 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_aver_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_aver_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_8x8_msa(src, stride, dest, stride);
}
6103 
/*
 * Exported wrapper for vert_mc_qpel_avg_dst_aver_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_vert_mc_qpel_avg_dst_aver_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    vert_mc_qpel_avg_dst_aver_src1_16x16_msa(src, stride, dest, stride);
}
6110 
6111 /* HV cases */
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src00_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src00_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}
6118 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src00_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src00_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}
6124 
/*
 * Exported wrapper for hv_mc_qpel_aver_v_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_v_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_16x16_msa(src, stride, dest, stride);
}
6130 
/*
 * Exported wrapper for hv_mc_qpel_aver_v_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_v_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src0_8x8_msa(src, stride, dest, stride);
}
6136 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src10_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src10_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}
6143 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src10_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src10_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}
6149 
/*
 * Exported wrapper for hv_mc_qpel_aver_h_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_h_src0_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_16x16_msa(src, stride, dest, stride);
}
6155 
/*
 * Exported wrapper for hv_mc_qpel_aver_h_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_h_src0_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src0_8x8_msa(src, stride, dest, stride);
}
6161 
/*
 * Exported wrapper for hv_mc_qpel_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_16x16_msa(uint8_t *dest, const uint8_t *src,
                             ptrdiff_t stride)
{
    hv_mc_qpel_16x16_msa(src, stride, dest, stride);
}
6167 
/*
 * Exported wrapper for hv_mc_qpel_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_8x8_msa(uint8_t *dest, const uint8_t *src,
                           ptrdiff_t stride)
{
    hv_mc_qpel_8x8_msa(src, stride, dest, stride);
}
6173 
/*
 * Exported wrapper for hv_mc_qpel_aver_h_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_h_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_16x16_msa(src, stride, dest, stride);
}
6179 
/*
 * Exported wrapper for hv_mc_qpel_aver_h_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_h_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_h_src1_8x8_msa(src, stride, dest, stride);
}
6185 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src01_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src01_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}
6192 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src01_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src01_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}
6198 
/*
 * Exported wrapper for hv_mc_qpel_aver_v_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_v_src1_16x16_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_16x16_msa(src, stride, dest, stride);
}
6204 
/*
 * Exported wrapper for hv_mc_qpel_aver_v_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_v_src1_8x8_msa(uint8_t *dest,
                                       const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_v_src1_8x8_msa(src, stride, dest, stride);
}
6210 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src11_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src11_16x16_msa(uint8_t *dest,
                                           const uint8_t *src,
                                           ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}
6217 
/*
 * Exported wrapper for hv_mc_qpel_aver_hv_src11_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_aver_hv_src11_8x8_msa(uint8_t *dest,
                                         const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6223 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}
6230 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}
6237 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(src, stride, dest, stride);
}
6244 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(src, stride, dest, stride);
}
6251 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}
6258 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}
6265 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(src, stride, dest, stride);
}
6272 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(src, stride, dest, stride);
}
6279 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_16x16_msa(uint8_t *dest,
                                     const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_16x16_msa(src, stride, dest, stride);
}
6285 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_8x8_msa(uint8_t *dest,
                                   const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
}
6291 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(src, stride, dest, stride);
}
6298 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(src, stride, dest, stride);
}
6305 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}
6312 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}
6319 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(src, stride, dest, stride);
}
6326 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(uint8_t *dest,
                                               const uint8_t *src,
                                               ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(src, stride, dest, stride);
}
6333 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                   const uint8_t *src,
                                                   ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}
6340 
/*
 * Exported wrapper for hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                 const uint8_t *src,
                                                 ptrdiff_t stride)
{
    hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6347 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_16x16_msa(src, stride, dest, stride);
}
6354 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(src, stride, dest, stride);
}
6361 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_16x16_msa(src, stride, dest, stride);
}
6368 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(src, stride, dest, stride);
}
6375 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_16x16_msa(src, stride, dest, stride);
}
6382 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(src, stride, dest, stride);
}
6389 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(src, stride, dest, stride);
}
6396 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(src, stride, dest, stride);
}
6403 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_16x16_msa(uint8_t *dest,
                                    const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
}
6409 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_8x8_msa(uint8_t *dest,
                                  const uint8_t *src, ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
}
6415 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(src, stride, dest, stride);
}
6422 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(src, stride, dest, stride);
}
6429 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(src, stride, dest, stride);
}
6436 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(src, stride, dest, stride);
}
6443 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(src, stride, dest, stride);
}
6450 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(uint8_t *dest,
                                              const uint8_t *src,
                                              ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(src, stride, dest, stride);
}
6457 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(uint8_t *dest,
                                                  const uint8_t *src,
                                                  ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(src, stride, dest, stride);
}
6464 
/*
 * Exported wrapper for hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa().
 * Calls it as (src, stride, dest, stride): the single `stride` is passed
 * twice, presumably as source and destination stride -- the helper is
 * defined outside this view.
 */
void ff_hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(uint8_t *dest,
                                                const uint8_t *src,
                                                ptrdiff_t stride)
{
    hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(src, stride, dest, stride);
}
6471