1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp8dsp.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "vp8dsp_mips.h"
24 
/*
 * Byte-shuffle index patterns, 16 entries per row. Every even slot holds an
 * index i and the slot after it holds i + 1.
 * The horizontal routines in this file load the row at offset 0 for blocks
 * that are 8 or 16 pixels wide and the row at offset 16 for blocks that are
 * 4 pixels wide; they then add 2 and 4 to every entry to build the masks for
 * the remaining tap pairs.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
     0,  1,  1,  2,  2,  3,  3,  4,
     4,  5,  5,  6,  6,  7,  7,  8,
    /* 4 width cases */
     0,  1,  1,  2,  2,  3,  3,  4,
    16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
     8,  9,  9, 10, 10, 11, 11, 12,
    24, 25, 25, 26, 26, 27, 27, 28,
};
33 
/*
 * Sub-pixel filter coefficients, selected in this file with (mx - 1) or
 * (my - 1), i.e. rows 0..6. Each real row sums to 128; rows 0, 2, 4 and 6
 * carry four non-zero coefficients, rows 1, 3 and 5 carry six.
 *
 * The filter routines fetch the selected row with LD_SH(), which reads a
 * whole v8i16 vector (16 bytes) although a row is only 8 bytes long. For
 * row 6 that read used to run 8 bytes past the end of the table, so one
 * all-zero padding row is appended to keep every such load inside the array.
 * The padding row is never selected as a filter.
 */
static const int8_t subpel_filters_msa[8][8] = {
    {-6, 123, 12, -1, 0, 0, 0, 0},
    {2, -11, 108, 36, -8, 1, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-9, 93, 50, -6, 0, 0, 0, 0},
    {3, -16, 77, 77, -16, 3, 0, 0},     /* New 1/2 pel 6 tap filter */
    {-6, 50, 93, -9, 0, 0, 0, 0},
    {1, -8, 36, 108, -11, 2, 0, 0},     /* New 1/4 pel 6 tap filter */
    {-1, 12, 123, -6, 0, 0, 0, 0},
    {0, 0, 0, 0, 0, 0, 0, 0},           /* padding for 16-byte vector loads */
};
43 
/*
 * Two-coefficient weight pairs. Row i holds {128 - 16 * (i + 1),
 * 16 * (i + 1)}, so every pair sums to 128.
 */
static const int8_t bilinear_filters_msa[7][2] = {
    { 112,  16 },
    {  96,  32 },
    {  80,  48 },
    {  64,  64 },
    {  48,  80 },
    {  32,  96 },
    {  16, 112 },
};
53 
/*
 * Horizontal 6-tap filter yielding one v8i16 vector.
 *
 * Written as a GNU statement expression, so it can be used on the right-hand
 * side of an assignment. The source pair (src0, src1) is shuffled three
 * times, once per mask (VSHF_B3_SB), the three shuffled vectors are combined
 * with filt_h0..filt_h2 by DPADD_SH3_SH, and the result is passed through
 * __msa_srari_h(., 7) followed by __msa_sat_s_h(., 7).
 *
 * The *_m locals are private to the macro; callers must not pass arguments
 * with those names.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                 \
                        filt_h0, filt_h1, filt_h2)                       \
( {                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m;                                        \
    v8i16 hz_out_m;                                                      \
                                                                         \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,  \
               vec0_m, vec1_m, vec2_m);                                  \
    hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m,                      \
                            filt_h0, filt_h1, filt_h2);                  \
                                                                         \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
                                                                         \
    hz_out_m;                                                            \
} )
70 
/*
 * Horizontal 6-tap accumulation for four source vectors, writing two
 * result vectors (out0, out1).
 *
 * Every VSHF_B2_SB call shuffles the pair (src0, src1) and the pair
 * (src2, src3) with the same mask. The mask0 shuffles initialise out0/out1
 * through DOTP_SB2_SH with filt0; the mask1 and mask2 shuffles are then
 * accumulated with DPADD_SB2_SH using filt1 and filt2.
 * No shift or saturation is applied here; the callers in this file do that
 * after the macro.
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, mask2,                \
                                   filt0, filt1, filt2,                \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);  \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);            \
}
85 
/*
 * Horizontal 6-tap accumulation for four source vectors, writing four
 * result vectors (out0..out3), one per source.
 *
 * Unlike the 4-wide variant, each source vector is shuffled with itself
 * (srcN, srcN). The mask0 shuffles initialise out0..out3 through DOTP_SB4_SH
 * with filt0; the mask1 and mask2 shuffles are accumulated afterwards with
 * DPADD_SB4_SH using filt1 and filt2. vec0_m..vec3_m are reused for the
 * mask1 shuffles once the initial dot product has consumed them.
 * No shift or saturation is applied here.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, mask2,                       \
                                   filt0, filt1, filt2,                       \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2,  \
                 out0, out1, out2, out3);                                     \
}
106 
/*
 * Two-step dot product evaluating to a v8i16 (GNU statement expression):
 * __msa_dotp_s_h(vec0, filt0) is computed first and
 * __msa_dpadd_s_h(., vec1, filt1) is accumulated onto it.
 * All four arguments are cast to v16i8 before use.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
( {                                                             \
    v8i16 tmp0;                                                 \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
                                                                \
    tmp0;                                                       \
} )
116 
/*
 * Horizontal 4-tap filter yielding one v8i16 vector (GNU statement
 * expression).
 *
 * The source pair (src0, src1) is shuffled with mask0 and with mask1
 * (VSHF_B2_SB), the two shuffled vectors are combined with filt_h0/filt_h1
 * by FILT_4TAP_DPADD_S_H, and the result is passed through
 * __msa_srari_h(., 7) followed by __msa_sat_s_h(., 7).
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)    \
( {                                                                    \
    v16i8 vec0_m, vec1_m;                                              \
    v8i16 hz_out_m;                                                    \
                                                                       \
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m);  \
    hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1);  \
                                                                       \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                             \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                             \
                                                                       \
    hz_out_m;                                                          \
} )
130 
/*
 * Horizontal 4-tap accumulation for four source vectors, writing two
 * result vectors (out0, out1).
 *
 * The pairs (src0, src1) and (src2, src3) are shuffled with mask0 and fed
 * to DOTP_SB2_SH with filt0 to initialise out0/out1; the mask1 shuffles of
 * the same pairs are accumulated with DPADD_SB2_SH using filt1.
 * No shift or saturation is applied here.
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}
142 
/*
 * Horizontal 4-tap accumulation for four source vectors, writing four
 * result vectors (out0..out3), one per source.
 *
 * Each source vector is shuffled with itself. The mask0 shuffles initialise
 * out0..out3 through DOTP_SB4_SH with filt0; vec0_m..vec3_m are then reused
 * for the mask1 shuffles, which are accumulated with DPADD_SB4_SH using
 * filt1. No shift or saturation is applied here.
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                    \
                                   mask0, mask1, filt0, filt1,                \
                                   out0, out1, out2, out3)                    \
{                                                                             \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                     \
                                                                              \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);         \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,   \
                out0, out1, out2, out3);                                      \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);         \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);         \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,  \
                 out0, out1, out2, out3);                                     \
}
158 
common_hz_6t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)159 static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride,
160                                  uint8_t *dst, int32_t dst_stride,
161                                  const int8_t *filter)
162 {
163     v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
164     v16u8 mask0, mask1, mask2, out;
165     v8i16 filt, out0, out1;
166 
167     mask0 = LD_UB(&mc_filt_mask_arr[16]);
168     src -= 2;
169 
170     /* rearranging filter */
171     filt = LD_SH(filter);
172     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
173 
174     mask1 = mask0 + 2;
175     mask2 = mask0 + 4;
176 
177     LD_SB4(src, src_stride, src0, src1, src2, src3);
178     XORI_B4_128_SB(src0, src1, src2, src3);
179     HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
180                                filt0, filt1, filt2, out0, out1);
181     SRARI_H2_SH(out0, out1, 7);
182     SAT_SH2_SH(out0, out1, 7);
183     out = PCKEV_XORI128_UB(out0, out1);
184     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
185 }
186 
common_hz_6t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)187 static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
188                                  uint8_t *dst, int32_t dst_stride,
189                                  const int8_t *filter)
190 {
191     v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
192     v16u8 mask0, mask1, mask2, out;
193     v8i16 filt, out0, out1, out2, out3;
194 
195     mask0 = LD_UB(&mc_filt_mask_arr[16]);
196     src -= 2;
197 
198     /* rearranging filter */
199     filt = LD_SH(filter);
200     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
201 
202     mask1 = mask0 + 2;
203     mask2 = mask0 + 4;
204 
205     LD_SB4(src, src_stride, src0, src1, src2, src3);
206     XORI_B4_128_SB(src0, src1, src2, src3);
207     src += (4 * src_stride);
208     HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
209                                filt0, filt1, filt2, out0, out1);
210     LD_SB4(src, src_stride, src0, src1, src2, src3);
211     XORI_B4_128_SB(src0, src1, src2, src3);
212     HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213                                filt0, filt1, filt2, out2, out3);
214     SRARI_H4_SH(out0, out1, out2, out3, 7);
215     SAT_SH4_SH(out0, out1, out2, out3, 7);
216     out = PCKEV_XORI128_UB(out0, out1);
217     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
218     out = PCKEV_XORI128_UB(out2, out3);
219     ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
220 }
221 
ff_put_vp8_epel4_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)222 void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
223                              uint8_t *src, ptrdiff_t src_stride,
224                              int height, int mx, int my)
225 {
226     const int8_t *filter = subpel_filters_msa[mx - 1];
227 
228     if (4 == height) {
229         common_hz_6t_4x4_msa(src, src_stride, dst, dst_stride, filter);
230     } else if (8 == height) {
231         common_hz_6t_4x8_msa(src, src_stride, dst, dst_stride, filter);
232     }
233 }
234 
ff_put_vp8_epel8_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)235 void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
236                              uint8_t *src, ptrdiff_t src_stride,
237                              int height, int mx, int my)
238 {
239     uint32_t loop_cnt;
240     const int8_t *filter = subpel_filters_msa[mx - 1];
241     v16i8 src0, src1, src2, src3, filt0, filt1, filt2;
242     v16u8 mask0, mask1, mask2, tmp0, tmp1;
243     v8i16 filt, out0, out1, out2, out3;
244 
245     mask0 = LD_UB(&mc_filt_mask_arr[0]);
246 
247     src -= 2;
248 
249     /* rearranging filter */
250     filt = LD_SH(filter);
251     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
252 
253     mask1 = mask0 + 2;
254     mask2 = mask0 + 4;
255 
256     LD_SB4(src, src_stride, src0, src1, src2, src3);
257     XORI_B4_128_SB(src0, src1, src2, src3);
258     src += (4 * src_stride);
259     HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
260                                filt0, filt1, filt2, out0, out1, out2, out3);
261     SRARI_H4_SH(out0, out1, out2, out3, 7);
262     SAT_SH4_SH(out0, out1, out2, out3, 7);
263     tmp0 = PCKEV_XORI128_UB(out0, out1);
264     tmp1 = PCKEV_XORI128_UB(out2, out3);
265     ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266     dst += (4 * dst_stride);
267 
268     for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269         LD_SB4(src, src_stride, src0, src1, src2, src3);
270         XORI_B4_128_SB(src0, src1, src2, src3);
271         src += (4 * src_stride);
272         HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
273                                    filt0, filt1, filt2, out0, out1, out2, out3);
274         SRARI_H4_SH(out0, out1, out2, out3, 7);
275         SAT_SH4_SH(out0, out1, out2, out3, 7);
276         tmp0 = PCKEV_XORI128_UB(out0, out1);
277         tmp1 = PCKEV_XORI128_UB(out2, out3);
278         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279         dst += (4 * dst_stride);
280     }
281 }
282 
ff_put_vp8_epel16_h6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)283 void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
284                               uint8_t *src, ptrdiff_t src_stride,
285                               int height, int mx, int my)
286 {
287     uint32_t loop_cnt;
288     const int8_t *filter = subpel_filters_msa[mx - 1];
289     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290     v16u8 mask0, mask1, mask2, out;
291     v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
292 
293     mask0 = LD_UB(&mc_filt_mask_arr[0]);
294     src -= 2;
295 
296     /* rearranging filter */
297     filt = LD_SH(filter);
298     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
299 
300     mask1 = mask0 + 2;
301     mask2 = mask0 + 4;
302 
303     for (loop_cnt = (height >> 2); loop_cnt--;) {
304         LD_SB4(src, src_stride, src0, src2, src4, src6);
305         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
306         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
307         src += (4 * src_stride);
308 
309         HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
310                                    filt0, filt1, filt2, out0, out1, out2, out3);
311         HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
312                                    filt0, filt1, filt2, out4, out5, out6, out7);
313         SRARI_H4_SH(out0, out1, out2, out3, 7);
314         SRARI_H4_SH(out4, out5, out6, out7, 7);
315         SAT_SH4_SH(out0, out1, out2, out3, 7);
316         SAT_SH4_SH(out4, out5, out6, out7, 7);
317         out = PCKEV_XORI128_UB(out0, out1);
318         ST_UB(out, dst);
319         dst += dst_stride;
320         out = PCKEV_XORI128_UB(out2, out3);
321         ST_UB(out, dst);
322         dst += dst_stride;
323         out = PCKEV_XORI128_UB(out4, out5);
324         ST_UB(out, dst);
325         dst += dst_stride;
326         out = PCKEV_XORI128_UB(out6, out7);
327         ST_UB(out, dst);
328         dst += dst_stride;
329     }
330 }
331 
ff_put_vp8_epel4_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)332 void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
333                              uint8_t *src, ptrdiff_t src_stride,
334                              int height, int mx, int my)
335 {
336     uint32_t loop_cnt;
337     const int8_t *filter = subpel_filters_msa[my - 1];
338     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
339     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
341     v16u8 out;
342     v8i16 filt, out10, out32;
343 
344     src -= (2 * src_stride);
345 
346     filt = LD_SH(filter);
347     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
348 
349     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350     src += (5 * src_stride);
351 
352     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
353                src32_r, src43_r);
354     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
355     XORI_B2_128_SB(src2110, src4332);
356 
357     for (loop_cnt = (height >> 2); loop_cnt--;) {
358         LD_SB4(src, src_stride, src5, src6, src7, src8);
359         src += (4 * src_stride);
360 
361         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362                    src65_r, src76_r, src87_r);
363         ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
364         XORI_B2_128_SB(src6554, src8776);
365         out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366         out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
367         SRARI_H2_SH(out10, out32, 7);
368         SAT_SH2_SH(out10, out32, 7);
369         out = PCKEV_XORI128_UB(out10, out32);
370         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371         dst += (4 * dst_stride);
372 
373         src2110 = src6554;
374         src4332 = src8776;
375         src4 = src8;
376     }
377 }
378 
ff_put_vp8_epel8_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)379 void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
380                              uint8_t *src, ptrdiff_t src_stride,
381                              int height, int mx, int my)
382 {
383     uint32_t loop_cnt;
384     const int8_t *filter = subpel_filters_msa[my - 1];
385     v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
386     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387     v16i8 src109_r, filt0, filt1, filt2;
388     v16u8 tmp0, tmp1;
389     v8i16 filt, out0_r, out1_r, out2_r, out3_r;
390 
391     src -= (2 * src_stride);
392 
393     filt = LD_SH(filter);
394     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
395 
396     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397     src += (5 * src_stride);
398 
399     XORI_B5_128_SB(src0, src1, src2, src3, src4);
400     ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401                src10_r, src32_r, src21_r, src43_r);
402 
403     for (loop_cnt = (height >> 2); loop_cnt--;) {
404         LD_SB4(src, src_stride, src7, src8, src9, src10);
405         XORI_B4_128_SB(src7, src8, src9, src10);
406         src += (4 * src_stride);
407 
408         ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409                    src87_r, src98_r, src109_r);
410         out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411         out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412         out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413         out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
414         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
415         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
416         tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
417         tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
418         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419         dst += (4 * dst_stride);
420 
421         src10_r = src76_r;
422         src32_r = src98_r;
423         src21_r = src87_r;
424         src43_r = src109_r;
425         src4 = src10;
426     }
427 }
428 
ff_put_vp8_epel16_v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)429 void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
430                               uint8_t *src, ptrdiff_t src_stride,
431                               int height, int mx, int my)
432 {
433     uint32_t loop_cnt;
434     const int8_t *filter = subpel_filters_msa[my - 1];
435     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
436     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438     v16i8 src65_l, src87_l, filt0, filt1, filt2;
439     v16u8 tmp0, tmp1, tmp2, tmp3;
440     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
441 
442     src -= (2 * src_stride);
443 
444     filt = LD_SH(filter);
445     SPLATI_H3_SB(filt, 0, 1, 2, filt0, filt1, filt2);
446 
447     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448     src += (5 * src_stride);
449 
450     XORI_B5_128_SB(src0, src1, src2, src3, src4);
451     ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452                src32_r, src43_r, src21_r);
453     ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454                src32_l, src43_l, src21_l);
455 
456     for (loop_cnt = (height >> 2); loop_cnt--;) {
457         LD_SB4(src, src_stride, src5, src6, src7, src8);
458         src += (4 * src_stride);
459 
460         XORI_B4_128_SB(src5, src6, src7, src8);
461         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462                    src65_r, src76_r, src87_r);
463         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464                    src65_l, src76_l, src87_l);
465         out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
466                               filt2);
467         out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
468                               filt2);
469         out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
470                               filt2);
471         out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
472                               filt2);
473         out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
474                               filt2);
475         out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
476                               filt2);
477         out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
478                               filt2);
479         out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
480                               filt2);
481         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
482         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
483         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486                     out3_r, tmp0, tmp1, tmp2, tmp3);
487         XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
488         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489         dst += (4 * dst_stride);
490 
491         src10_r = src54_r;
492         src32_r = src76_r;
493         src21_r = src65_r;
494         src43_r = src87_r;
495         src10_l = src54_l;
496         src32_l = src76_l;
497         src21_l = src65_l;
498         src43_l = src87_l;
499         src4 = src8;
500     }
501 }
502 
ff_put_vp8_epel4_h6v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)503 void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
504                                uint8_t *src, ptrdiff_t src_stride,
505                                int height, int mx, int my)
506 {
507     uint32_t loop_cnt;
508     const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
509     const int8_t *filter_vert = subpel_filters_msa[my - 1];
510     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
511     v16i8 filt_hz0, filt_hz1, filt_hz2;
512     v16u8 mask0, mask1, mask2, out;
513     v8i16 tmp0, tmp1;
514     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515     v8i16 hz_out7, filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
516 
517     mask0 = LD_UB(&mc_filt_mask_arr[16]);
518     src -= (2 + 2 * src_stride);
519 
520     /* rearranging filter */
521     filt = LD_SH(filter_horiz);
522     SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
523 
524     filt = LD_SH(filter_vert);
525     SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
526 
527     mask1 = mask0 + 2;
528     mask2 = mask0 + 4;
529 
530     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531     src += (5 * src_stride);
532 
533     XORI_B5_128_SB(src0, src1, src2, src3, src4);
534     hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
535                               filt_hz1, filt_hz2);
536     hz_out2 = HORIZ_6TAP_FILT(src2, src3, mask0, mask1, mask2, filt_hz0,
537                               filt_hz1, filt_hz2);
538     hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
539     hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
540                               filt_hz1, filt_hz2);
541     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
542 
543     for (loop_cnt = (height >> 2); loop_cnt--;) {
544         LD_SB2(src, src_stride, src5, src6);
545         src += (2 * src_stride);
546 
547         XORI_B2_128_SB(src5, src6);
548         hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
549                                   filt_hz1, filt_hz2);
550         hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
551 
552         LD_SB2(src, src_stride, src7, src8);
553         src += (2 * src_stride);
554 
555         XORI_B2_128_SB(src7, src8);
556         hz_out7 = HORIZ_6TAP_FILT(src7, src8, mask0, mask1, mask2, filt_hz0,
557                                   filt_hz1, filt_hz2);
558         hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
559 
560         out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561         tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
562 
563         out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564         tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
565 
566         SRARI_H2_SH(tmp0, tmp1, 7);
567         SAT_SH2_SH(tmp0, tmp1, 7);
568         out = PCKEV_XORI128_UB(tmp0, tmp1);
569         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570         dst += (4 * dst_stride);
571 
572         hz_out3 = hz_out7;
573         out0 = out2;
574         out1 = out3;
575     }
576 }
577 
/*
 * 8-pixel-wide combined horizontal + vertical 6-tap filter.
 *
 * dst, dst_stride: destination and row pitch.
 * src, src_stride: source and row pitch; loading starts two rows above and
 *                  two bytes to the left of src.
 * height:          number of rows, processed in groups of four
 *                  (height >> 2 groups).
 * mx, my:          select the horizontal / vertical coefficient rows
 *                  subpel_filters_msa[mx - 1] and subpel_filters_msa[my - 1].
 *
 * Every source row is first filtered horizontally on its own
 * (HORIZ_6TAP_FILT with the same vector passed twice). Horizontal results of
 * neighbouring rows are then interleaved and three interleaved vectors are
 * combined per output row with the vertical coefficients.
 */
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, filt_hz2;
    v16u8 mask0, mask1, mask2, vec0, vec1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
    v8i16 tmp0, tmp1, tmp2, tmp3;

    /* 8-width shuffle pattern; mask1/mask2 below are its +2 / +4 variants */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= (2 + 2 * src_stride);

    /* rearranging filter */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prime the window with the first five rows */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);

    /* vertical coefficients are broadcast as halfword vectors */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    /*
     * out0/out1 pair rows (0,1) and (2,3); out3/out4 pair rows (1,2) and
     * (3,4). The first set feeds the even output rows, the second set the
     * odd ones.
     */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        /* output row 0 of the group: pairs (0,1), (2,3), (4,5) */
        hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        /* output row 1: pairs (1,2), (3,4), (5,6) */
        hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
        tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);

        /* output row 2: pairs (2,3), (4,5), (6,7) */
        hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);

        /* output row 3: pairs (3,4), (5,6), (7,8) */
        hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);

        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
        vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /*
         * Slide the window down by four rows: the pairs (4,5), (6,7),
         * (5,6) and (7,8) become the new (0,1), (2,3), (1,2) and (3,4).
         */
        hz_out4 = hz_out8;
        out0 = out2;
        out1 = out7;
        out3 = out5;
        out4 = out6;
    }
}
663 
664 
/*
 * 16-column h6v6 put.
 *
 * Runs the 8-column kernel twice: once at column offset 0 and once at
 * column offset 8.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h6v6_msa(dst + col, dst_stride, src + col,
                                  src_stride, height, mx, my);
    }
}
679 
common_hz_4t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)680 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
681                                  uint8_t *dst, int32_t dst_stride,
682                                  const int8_t *filter)
683 {
684     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
685     v8i16 filt, out0, out1;
686     v16u8 out;
687 
688     mask0 = LD_SB(&mc_filt_mask_arr[16]);
689     src -= 1;
690 
691     /* rearranging filter */
692     filt = LD_SH(filter);
693     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
694 
695     mask1 = mask0 + 2;
696 
697     LD_SB4(src, src_stride, src0, src1, src2, src3);
698     XORI_B4_128_SB(src0, src1, src2, src3);
699     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
700                                filt0, filt1, out0, out1);
701     SRARI_H2_SH(out0, out1, 7);
702     SAT_SH2_SH(out0, out1, 7);
703     out = PCKEV_XORI128_UB(out0, out1);
704     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
705 }
706 
common_hz_4t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)707 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
708                                  uint8_t *dst, int32_t dst_stride,
709                                  const int8_t *filter)
710 {
711     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
712     v16u8 out;
713     v8i16 filt, out0, out1, out2, out3;
714 
715     mask0 = LD_SB(&mc_filt_mask_arr[16]);
716     src -= 1;
717 
718     /* rearranging filter */
719     filt = LD_SH(filter);
720     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
721 
722     mask1 = mask0 + 2;
723 
724     LD_SB4(src, src_stride, src0, src1, src2, src3);
725     src += (4 * src_stride);
726 
727     XORI_B4_128_SB(src0, src1, src2, src3);
728     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
729                                filt0, filt1, out0, out1);
730     LD_SB4(src, src_stride, src0, src1, src2, src3);
731     XORI_B4_128_SB(src0, src1, src2, src3);
732     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
733                                filt0, filt1, out2, out3);
734     SRARI_H4_SH(out0, out1, out2, out3, 7);
735     SAT_SH4_SH(out0, out1, out2, out3, 7);
736     out = PCKEV_XORI128_UB(out0, out1);
737     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
738     out = PCKEV_XORI128_UB(out2, out3);
739     ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
740 }
741 
common_hz_4t_4x16_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)742 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
743                                   uint8_t *dst, int32_t dst_stride,
744                                   const int8_t *filter)
745 {
746     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
747     v16i8 filt0, filt1, mask0, mask1;
748     v16u8 out;
749     v8i16 filt, out0, out1, out2, out3;
750 
751     mask0 = LD_SB(&mc_filt_mask_arr[16]);
752     src -= 1;
753 
754     /* rearranging filter */
755     filt = LD_SH(filter);
756     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
757 
758     mask1 = mask0 + 2;
759 
760     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761     src += (8 * src_stride);
762     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
763     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
764                                filt0, filt1, out0, out1);
765     HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
766                                filt0, filt1, out2, out3);
767     SRARI_H4_SH(out0, out1, out2, out3, 7);
768     SAT_SH4_SH(out0, out1, out2, out3, 7);
769     out = PCKEV_XORI128_UB(out0, out1);
770     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771     dst += (4 * dst_stride);
772     out = PCKEV_XORI128_UB(out2, out3);
773     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774     dst += (4 * dst_stride);
775 
776     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777     src += (8 * src_stride);
778     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
779     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
780                                filt0, filt1, out0, out1);
781     HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
782                                filt0, filt1, out2, out3);
783     SRARI_H4_SH(out0, out1, out2, out3, 7);
784     SAT_SH4_SH(out0, out1, out2, out3, 7);
785     out = PCKEV_XORI128_UB(out0, out1);
786     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787     dst += (4 * dst_stride);
788     out = PCKEV_XORI128_UB(out2, out3);
789     ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
790 }
791 
ff_put_vp8_epel4_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)792 void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
793                              uint8_t *src, ptrdiff_t src_stride,
794                              int height, int mx, int my)
795 {
796     const int8_t *filter = subpel_filters_msa[mx - 1];
797 
798     if (4 == height) {
799         common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
800     } else if (8 == height) {
801         common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
802     } else if (16 == height) {
803         common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
804     }
805 }
806 
ff_put_vp8_epel8_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)807 void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
808                              uint8_t *src, ptrdiff_t src_stride,
809                              int height, int mx, int my)
810 {
811     uint32_t loop_cnt;
812     const int8_t *filter = subpel_filters_msa[mx - 1];
813     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
814     v16u8 tmp0, tmp1;
815     v8i16 filt, out0, out1, out2, out3;
816 
817     mask0 = LD_SB(&mc_filt_mask_arr[0]);
818     src -= 1;
819 
820     /* rearranging filter */
821     filt = LD_SH(filter);
822     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
823 
824     mask1 = mask0 + 2;
825 
826     for (loop_cnt = (height >> 2); loop_cnt--;) {
827         LD_SB4(src, src_stride, src0, src1, src2, src3);
828         src += (4 * src_stride);
829 
830         XORI_B4_128_SB(src0, src1, src2, src3);
831         HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
832                                    filt1, out0, out1, out2, out3);
833         SRARI_H4_SH(out0, out1, out2, out3, 7);
834         SAT_SH4_SH(out0, out1, out2, out3, 7);
835         tmp0 = PCKEV_XORI128_UB(out0, out1);
836         tmp1 = PCKEV_XORI128_UB(out2, out3);
837         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838         dst += (4 * dst_stride);
839     }
840 }
841 
ff_put_vp8_epel16_h4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)842 void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
843                               uint8_t *src, ptrdiff_t src_stride,
844                               int height, int mx, int my)
845 {
846     uint32_t loop_cnt;
847     const int8_t *filter = subpel_filters_msa[mx - 1];
848     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
849     v16i8 filt0, filt1, mask0, mask1;
850     v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
851     v16u8 out;
852 
853     mask0 = LD_SB(&mc_filt_mask_arr[0]);
854     src -= 1;
855 
856     /* rearranging filter */
857     filt = LD_SH(filter);
858     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
859 
860     mask1 = mask0 + 2;
861 
862     for (loop_cnt = (height >> 2); loop_cnt--;) {
863         LD_SB4(src, src_stride, src0, src2, src4, src6);
864         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865         src += (4 * src_stride);
866 
867         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
868         HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
869                                    filt1, out0, out1, out2, out3);
870         HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
871                                    filt1, out4, out5, out6, out7);
872         SRARI_H4_SH(out0, out1, out2, out3, 7);
873         SRARI_H4_SH(out4, out5, out6, out7, 7);
874         SAT_SH4_SH(out0, out1, out2, out3, 7);
875         SAT_SH4_SH(out4, out5, out6, out7, 7);
876         out = PCKEV_XORI128_UB(out0, out1);
877         ST_UB(out, dst);
878         dst += dst_stride;
879         out = PCKEV_XORI128_UB(out2, out3);
880         ST_UB(out, dst);
881         dst += dst_stride;
882         out = PCKEV_XORI128_UB(out4, out5);
883         ST_UB(out, dst);
884         dst += dst_stride;
885         out = PCKEV_XORI128_UB(out6, out7);
886         ST_UB(out, dst);
887         dst += dst_stride;
888     }
889 }
890 
/*
 * 4-column vertical 4-tap put.
 *
 * The coefficient row is subpel_filters_msa[my - 1]; mx is not used.
 * Four output rows are produced per loop iteration and the loop runs
 * height >> 2 times, so a height that is not a multiple of four leaves
 * the remaining rows unwritten.  Source reads start one row above src.
 */
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                             uint8_t *src, ptrdiff_t src_stride,
                             int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    /* the vertical window begins one row above the output row */
    src -= src_stride;

    /*
     * Broadcast halfword elements 0 and 1 of the coefficient row.
     * NOTE(review): the filter rows in subpel_filters_msa are 8 bytes
     * long; if LD_SH loads a full 16-byte vector this reads past the row
     * (and past the table for its last row) -- confirm.
     */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* prologue: three rows that precede the first loop iteration */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    /* interleave vertically adjacent rows: (0,1) and (1,2) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* combine both row pairs into one vector, then XOR each byte with 128 */
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* three new rows; src2 still holds the last row seen so far */
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        /* first accumulator (stored as part of this iteration's 4 rows) */
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        /*
         * The fourth new row is loaded straight into src2 so that it is
         * already in place as the "last row seen" for the next iteration.
         */
        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        /*
         * src2110 is rebuilt from the newest row pairs: it is the second
         * operand of the filter below and also the state carried into the
         * next iteration.
         */
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        /* SRARI by 7, SAT with 7, pack (PCKEV + XOR 128) and store */
        SRARI_H2_SH(out10, out32, 7);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
937 
ff_put_vp8_epel8_v4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)938 void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
939                              uint8_t *src, ptrdiff_t src_stride,
940                              int height, int mx, int my)
941 {
942     uint32_t loop_cnt;
943     const int8_t *filter = subpel_filters_msa[my - 1];
944     v16i8 src0, src1, src2, src7, src8, src9, src10;
945     v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
946     v16u8 tmp0, tmp1;
947     v8i16 filt, out0_r, out1_r, out2_r, out3_r;
948 
949     src -= src_stride;
950 
951     filt = LD_SH(filter);
952     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
953 
954     LD_SB3(src, src_stride, src0, src1, src2);
955     src += (3 * src_stride);
956 
957     XORI_B3_128_SB(src0, src1, src2);
958     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
959 
960     for (loop_cnt = (height >> 2); loop_cnt--;) {
961         LD_SB4(src, src_stride, src7, src8, src9, src10);
962         src += (4 * src_stride);
963 
964         XORI_B4_128_SB(src7, src8, src9, src10);
965         ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966                    src72_r, src87_r, src98_r, src109_r);
967         out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
968         out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
969         out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
970         out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
971         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
972         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
973         tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
974         tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
975         ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976         dst += (4 * dst_stride);
977 
978         src10_r = src98_r;
979         src21_r = src109_r;
980         src2 = src10;
981     }
982 }
983 
ff_put_vp8_epel16_v4_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)984 void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
985                               uint8_t *src, ptrdiff_t src_stride,
986                               int height, int mx, int my)
987 {
988     uint32_t loop_cnt;
989     const int8_t *filter = subpel_filters_msa[my - 1];
990     v16i8 src0, src1, src2, src3, src4, src5, src6;
991     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992     v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993     v16u8 tmp0, tmp1, tmp2, tmp3;
994     v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
995 
996     src -= src_stride;
997 
998     filt = LD_SH(filter);
999     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1000 
1001     LD_SB3(src, src_stride, src0, src1, src2);
1002     src += (3 * src_stride);
1003 
1004     XORI_B3_128_SB(src0, src1, src2);
1005     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1007 
1008     for (loop_cnt = (height >> 2); loop_cnt--;) {
1009         LD_SB4(src, src_stride, src3, src4, src5, src6);
1010         src += (4 * src_stride);
1011 
1012         XORI_B4_128_SB(src3, src4, src5, src6);
1013         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014                    src32_r, src43_r, src54_r, src65_r);
1015         ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016                    src32_l, src43_l, src54_l, src65_l);
1017         out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
1018         out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
1019         out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
1020         out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
1021         out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
1022         out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
1023         out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
1024         out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
1025         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1026         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1027         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030                     out3_r, tmp0, tmp1, tmp2, tmp3);
1031         XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1032         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033         dst += (4 * dst_stride);
1034 
1035         src10_r = src54_r;
1036         src21_r = src65_r;
1037         src10_l = src54_l;
1038         src21_l = src65_l;
1039         src2 = src6;
1040     }
1041 }
1042 
/*
 * 4-column put with a horizontal 4-tap pass followed by a vertical
 * 4-tap pass.
 *
 * The horizontal coefficients come from subpel_filters_msa[mx - 1] and
 * the vertical ones from subpel_filters_msa[my - 1].  Four output rows
 * are produced per loop iteration and the loop runs height >> 2 times,
 * so a height that is not a multiple of four leaves the remaining rows
 * unwritten.  Source reads start one byte left of and one row above src.
 */
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    /* shuffle control: second 16-entry row of the mask table */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back one column and one row to the start of the filter window */
    src -= (1 + 1 * src_stride);

    /*
     * Broadcast halfword elements 0 and 1 of the horizontal coefficients.
     * NOTE(review): the filter rows in subpel_filters_msa are 8 bytes
     * long; if LD_SH loads a full 16-byte vector this reads past the row
     * (and past the table for its last row) -- confirm.
     */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prologue: three rows that precede the first loop iteration */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /*
     * HORIZ_4TAP_FILT is given two consecutive rows per call; each
     * hz_out vector presumably carries one filtered row per 64-bit half
     * (the 8-byte sldi splices in the loop rely on that layout).
     */
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* broadcast halfword elements 0 and 1 of the vertical coefficients */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        /* first two new rows */
        XORI_B2_128_SB(src3, src4);
        hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
        /* splice 8 bytes of the carried result with 8 bytes of the new one */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* last two new rows, same scheme */
        XORI_B2_128_SB(src5, src6);
        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* SRARI by 7, SAT with 7, pack (PCKEV + XOR 128) and store */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and interleave forward */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1101 
/*
 * 8-column put with a horizontal 4-tap pass followed by a vertical
 * 4-tap pass.
 *
 * The horizontal coefficients come from subpel_filters_msa[mx - 1] and
 * the vertical ones from subpel_filters_msa[my - 1].  Four output rows
 * are produced per loop iteration and the loop runs height >> 2 times.
 * Source reads start one byte left of and one row above src.
 */
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
    v16u8 mask0, mask1, out0, out1;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vec0, vec1, vec2, vec3, vec4;

    /* shuffle control: first 16-entry row of the mask table */
    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back one column and one row to the start of the filter window */
    src -= (1 + 1 * src_stride);

    /* broadcast halfword elements 0 and 1 of the horizontal coefficients */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prologue: three rows that precede the first loop iteration */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* one horizontally filtered vector per prologue row */
    hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
    hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
    /* vec0 pairs prologue rows 0/1, vec2 pairs prologue rows 1/2 */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* broadcast halfword elements 0 and 1 of the vertical coefficients */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        /* output row 0: carried pair vec0 plus (previous row, src3) */
        hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried pair vec2 plus (src3, src4);
         * hz_out0 is reused for the filtered src4 row */
        hz_out0 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /* output row 2: vec1 plus (src4, src5);
         * hz_out1 is reused for the filtered src5 row */
        hz_out1 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt_vt0, filt_vt1);

        /* output row 3: (src3, src4) plus (src5, src6); hz_out2 now holds
         * the filtered src6 row, i.e. the "previous row" of the next pass */
        hz_out2 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* SRARI by 7, SAT with 7, pack (PCKEV + XOR 128) and store */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the two newest row pairs: (src4, src5) and (src5, src6) */
        vec0 = vec4;
        vec2 = vec1;
    }
}
1168 
/*
 * 16-column h4v4 put.
 *
 * Runs the 8-column kernel twice: once at column offset 0 and once at
 * column offset 8.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h4v4_msa(dst + col, dst_stride, src + col,
                                  src_stride, height, mx, my);
    }
}
1183 
/*
 * 4-column put with a horizontal 6-tap pass followed by a vertical
 * 4-tap pass.
 *
 * The horizontal coefficients come from subpel_filters_msa[mx - 1] and
 * the vertical ones from subpel_filters_msa[my - 1].  Four output rows
 * are produced per loop iteration and the loop runs height >> 2 times.
 * Source reads start two bytes left of and one row above src.
 */
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2;
    v16u8 res0, res1, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;

    /* shuffle control: second 16-entry row of the mask table */
    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back two columns and one row to the start of the filter window */
    src -= (2 + 1 * src_stride);

    /* broadcast halfword elements 0, 1 and 2 of the horizontal coefficients */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* one shuffle mask per coefficient pair, each advanced by two */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prologue: three rows that precede the first loop iteration */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /*
     * HORIZ_6TAP_FILT is given two consecutive rows per call; each
     * hz_out vector presumably carries one filtered row per 64-bit half
     * (the 8-byte sldi splices in the loop rely on that layout).
     */
    hz_out0 = HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);

    /* broadcast halfword elements 0 and 1 of the vertical coefficients */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        /* first two new rows */
        hz_out3 = HORIZ_6TAP_FILT(src3, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        /* splice 8 bytes of the carried result with 8 bytes of the new one */
        hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* last two new rows, same scheme */
        hz_out5 = HORIZ_6TAP_FILT(src5, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp1 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* SRARI by 7, SAT with 7, pack each accumulator with itself,
         * XOR with 128 and store two words from each result vector */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
        XORI_B2_128_UB(res0, res1);
        ST_W2(res0, 0, 1, dst, dst_stride);
        ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and interleave forward */
        hz_out1 = hz_out5;
        vec0 = vec2;
    }
}
1249 
/*
 * 8-column put with a horizontal 6-tap pass followed by a vertical
 * 4-tap pass.
 *
 * The horizontal coefficients come from subpel_filters_msa[mx - 1] and
 * the vertical ones from subpel_filters_msa[my - 1].  Four output rows
 * are produced per loop iteration and the loop runs height >> 2 times.
 * Source reads start two bytes left of and one row above src.
 */
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
    v8i16 filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
    v16u8 out0, out1;

    /* shuffle control: first 16-entry row of the mask table */
    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    /* step back two columns and one row to the start of the filter window */
    src -= (2 + src_stride);

    /* broadcast halfword elements 0, 1 and 2 of the horizontal coefficients */
    filt = LD_SH(filter_horiz);
    SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);

    /* one shuffle mask per coefficient pair, each advanced by two */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;

    /* prologue: three rows that precede the first loop iteration */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    /* one horizontally filtered vector per prologue row */
    hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    /* vec0 pairs prologue rows 0/1, vec2 pairs prologue rows 1/2 */
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);

    /* broadcast halfword elements 0 and 1 of the vertical coefficients */
    filt = LD_SH(filter_vert);
    SPLATI_H2_SH(filt, 0, 1, filt_vt0, filt_vt1);

    /*
     * Loop-carried state is implicit: by the end of the body vec0 holds
     * the (src4, src5) pair, vec2 the (src5, src6) pair and hz_out2 the
     * filtered src6 row, which is exactly what the next iteration reads,
     * so no explicit copies are needed.
     */
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);

        /* output row 0: carried pair vec0 plus (previous row, src3) */
        hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);

        /* output row 1: carried pair vec2 plus (src3, src4) */
        hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);

        /* output row 2: vec1 plus (src4, src5); vec0 is overwritten here
         * and keeps this value into the next iteration */
        hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);

        /* output row 3: (src3, src4) plus (src5, src6) */
        hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
                                  filt_hz1, filt_hz2);
        ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
        tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);

        /* SRARI by 7, SAT with 7, pack (PCKEV + XOR 128) and store */
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
1322 
/*
 * 16-column h6v4 put.
 *
 * Runs the 8-column kernel twice: once at column offset 0 and once at
 * column offset 8.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    int col;

    for (col = 0; col < 16; col += 8) {
        ff_put_vp8_epel8_h6v4_msa(dst + col, dst_stride, src + col,
                                  src_stride, height, mx, my);
    }
}
1337 
/*
 * 4-column put with a horizontal 4-tap pass followed by a vertical
 * 6-tap pass.
 *
 * The horizontal coefficients come from subpel_filters_msa[mx - 1] and
 * the vertical ones from subpel_filters_msa[my - 1].  Four output rows
 * are produced per loop iteration and the loop runs height >> 2 times.
 * Source reads start one byte left of and two rows above src.
 */
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                               uint8_t *src, ptrdiff_t src_stride,
                               int height, int mx, int my)
{
    uint32_t loop_cnt;
    const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
    const int8_t *filter_vert = subpel_filters_msa[my - 1];
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 filt_hz0, filt_hz1, mask0, mask1;
    v16u8 out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2;

    /* shuffle control: second 16-entry row of the mask table */
    mask0 = LD_SB(&mc_filt_mask_arr[16]);

    /* step back one column and two rows to the start of the filter window */
    src -= (1 + 2 * src_stride);

    /* broadcast halfword elements 0 and 1 of the horizontal coefficients */
    filt = LD_SH(filter_horiz);
    SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);

    mask1 = mask0 + 2;

    /* prologue: five rows that precede the first loop iteration */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /*
     * HORIZ_4TAP_FILT is given two consecutive rows per call; each
     * hz_out vector presumably carries one filtered row per 64-bit half
     * (the 8-byte sldi splices rely on that layout).
     */
    hz_out0 = HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
    hz_out3 = HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    /* hz_out1 is not filtered separately: it is spliced from hz_out0/hz_out2 */
    hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);

    /* broadcast halfword elements 0, 1 and 2 of the vertical coefficients */
    filt = LD_SH(filter_vert);
    SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        XORI_B4_128_SB(src5, src6, src7, src8);
        src += (4 * src_stride);

        /* first two new rows: three interleaved pairs feed the 6-tap sum */
        hz_out5 = HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
        hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
        out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
        tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

        /* last two new rows, window advanced by one pair */
        hz_out7 = HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        tmp1 = DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

        /* SRARI by 7, SAT with 7, pack (PCKEV + XOR 128) and store */
        SRARI_H2_SH(tmp0, tmp1, 7);
        SAT_SH2_SH(tmp0, tmp1, 7);
        out = PCKEV_XORI128_UB(tmp0, tmp1);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry the newest horizontal result and the two newest pairs */
        hz_out3 = hz_out7;
        out0 = out2;
        out1 = out3;
    }
}
1401 
ff_put_vp8_epel8_h4v6_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1402 void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
1403                                uint8_t *src, ptrdiff_t src_stride,
1404                                int height, int mx, int my)
1405 {
1406     uint32_t loop_cnt;
1407     const int8_t *filter_horiz = subpel_filters_msa[mx - 1];
1408     const int8_t *filter_vert = subpel_filters_msa[my - 1];
1409     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1410     v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411     v8i16 filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413     v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1414     v16u8 vec0, vec1;
1415 
1416     mask0 = LD_SB(&mc_filt_mask_arr[0]);
1417     src -= (1 + 2 * src_stride);
1418 
1419     /* rearranging filter */
1420     filt = LD_SH(filter_horiz);
1421     SPLATI_H2_SB(filt, 0, 1, filt_hz0, filt_hz1);
1422 
1423     mask1 = mask0 + 2;
1424 
1425     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426     src += (5 * src_stride);
1427 
1428     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1429     hz_out0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430     hz_out1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431     hz_out2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432     hz_out3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433     hz_out4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434     ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435     ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1436 
1437     filt = LD_SH(filter_vert);
1438     SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1439 
1440     for (loop_cnt = (height >> 2); loop_cnt--;) {
1441         LD_SB4(src, src_stride, src5, src6, src7, src8);
1442         src += (4 * src_stride);
1443 
1444         XORI_B4_128_SB(src5, src6, src7, src8);
1445 
1446         hz_out5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447         out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448         tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1449 
1450         hz_out6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451         out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452         tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1453 
1454         hz_out7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455         out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456         tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1457 
1458         hz_out8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459         out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460         tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1461 
1462         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1463         SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1464         vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
1465         vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
1466         ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467         dst += (4 * dst_stride);
1468 
1469         hz_out4 = hz_out8;
1470         out0 = out2;
1471         out1 = out6;
1472         out3 = out5;
1473         out4 = out7;
1474     }
1475 }
1476 
/**
 * 16-pixel-wide wrapper around ff_put_vp8_epel8_h4v6_msa().
 *
 * The block is processed as two adjacent 8-pixel-wide columns. All arguments
 * are forwarded unchanged, except that dst and src are advanced by 8 bytes
 * for the right-hand column.
 */
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    /* left 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst, dst_stride, src, src_stride, height,
                              mx, my);
    /* right 8 columns */
    ff_put_vp8_epel8_h4v6_msa(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}

common_hz_2t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1492 static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1493                                  uint8_t *dst, int32_t dst_stride,
1494                                  const int8_t *filter)
1495 {
1496     v16i8 src0, src1, src2, src3, mask;
1497     v16u8 filt0, vec0, vec1, res0, res1;
1498     v8u16 vec2, vec3, filt;
1499 
1500     mask = LD_SB(&mc_filt_mask_arr[16]);
1501 
1502     /* rearranging filter */
1503     filt = LD_UH(filter);
1504     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1505 
1506     LD_SB4(src, src_stride, src0, src1, src2, src3);
1507     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508     DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1509     SRARI_H2_UH(vec2, vec3, 7);
1510     PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1511     ST_W2(res0, 0, 1, dst, dst_stride);
1512     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1513 }
1514 
common_hz_2t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1515 static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1516                                  uint8_t *dst, int32_t dst_stride,
1517                                  const int8_t *filter)
1518 {
1519     v16u8 vec0, vec1, vec2, vec3, filt0;
1520     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1521     v16i8 res0, res1, res2, res3;
1522     v8u16 vec4, vec5, vec6, vec7, filt;
1523 
1524     mask = LD_SB(&mc_filt_mask_arr[16]);
1525 
1526     /* rearranging filter */
1527     filt = LD_UH(filter);
1528     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1529 
1530     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531     VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532     VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534                 vec4, vec5, vec6, vec7);
1535     SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1536     PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537                 res0, res1, res2, res3);
1538     ST_W2(res0, 0, 1, dst, dst_stride);
1539     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540     ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541     ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1542 }
1543 
ff_put_vp8_bilinear4_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1544 void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1545                                 uint8_t *src, ptrdiff_t src_stride,
1546                                 int height, int mx, int my)
1547 {
1548     const int8_t *filter = bilinear_filters_msa[mx - 1];
1549 
1550     if (4 == height) {
1551         common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1552     } else if (8 == height) {
1553         common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1554     }
1555 }
1556 
common_hz_2t_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1557 static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1558                                  uint8_t *dst, int32_t dst_stride,
1559                                  const int8_t *filter)
1560 {
1561     v16u8 filt0;
1562     v16i8 src0, src1, src2, src3, mask;
1563     v8u16 vec0, vec1, vec2, vec3, filt;
1564 
1565     mask = LD_SB(&mc_filt_mask_arr[0]);
1566 
1567     /* rearranging filter */
1568     filt = LD_UH(filter);
1569     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1570 
1571     LD_SB4(src, src_stride, src0, src1, src2, src3);
1572     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575                 vec0, vec1, vec2, vec3);
1576     SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1577     PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1578     ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1579 }
1580 
common_hz_2t_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1581 static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1582                                      uint8_t *dst, int32_t dst_stride,
1583                                      const int8_t *filter, int32_t height)
1584 {
1585     v16u8 filt0;
1586     v16i8 src0, src1, src2, src3, mask, out0, out1;
1587     v8u16 vec0, vec1, vec2, vec3, filt;
1588 
1589     mask = LD_SB(&mc_filt_mask_arr[0]);
1590 
1591     /* rearranging filter */
1592     filt = LD_UH(filter);
1593     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1594 
1595     LD_SB4(src, src_stride, src0, src1, src2, src3);
1596     src += (4 * src_stride);
1597 
1598     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601                 vec0, vec1, vec2, vec3);
1602     SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1603 
1604     LD_SB4(src, src_stride, src0, src1, src2, src3);
1605     src += (4 * src_stride);
1606 
1607     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1608     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1609 
1610     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613                 vec0, vec1, vec2, vec3);
1614     SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1615     PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1616     ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617     dst += (8 * dst_stride);
1618 
1619     if (16 == height) {
1620         LD_SB4(src, src_stride, src0, src1, src2, src3);
1621         src += (4 * src_stride);
1622 
1623         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626                     vec0, vec1, vec2, vec3);
1627         SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1628         LD_SB4(src, src_stride, src0, src1, src2, src3);
1629         src += (4 * src_stride);
1630 
1631         PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1632         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1633 
1634         VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635         VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637                     vec0, vec1, vec2, vec3);
1638         SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1639         PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1640         ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1641     }
1642 }
1643 
ff_put_vp8_bilinear8_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1644 void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1645                                 uint8_t *src, ptrdiff_t src_stride,
1646                                 int height, int mx, int my)
1647 {
1648     const int8_t *filter = bilinear_filters_msa[mx - 1];
1649 
1650     if (4 == height) {
1651         common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1652     } else {
1653         common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1654                                  height);
1655     }
1656 }
1657 
ff_put_vp8_bilinear16_h_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1658 void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1659                                  uint8_t *src, ptrdiff_t src_stride,
1660                                  int height, int mx, int my)
1661 {
1662     uint32_t loop_cnt;
1663     const int8_t *filter = bilinear_filters_msa[mx - 1];
1664     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1665     v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666     v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
1667 
1668     mask = LD_SB(&mc_filt_mask_arr[0]);
1669 
1670     loop_cnt = (height >> 2) - 1;
1671 
1672     /* rearranging filter */
1673     filt = LD_UH(filter);
1674     filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1675 
1676     LD_SB4(src, src_stride, src0, src2, src4, src6);
1677     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678     src += (4 * src_stride);
1679 
1680     VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681     VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682     VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683     VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685                 out0, out1, out2, out3);
1686     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687                 out4, out5, out6, out7);
1688     SRARI_H4_UH(out0, out1, out2, out3, 7);
1689     SRARI_H4_UH(out4, out5, out6, out7, 7);
1690     PCKEV_ST_SB(out0, out1, dst);
1691     dst += dst_stride;
1692     PCKEV_ST_SB(out2, out3, dst);
1693     dst += dst_stride;
1694     PCKEV_ST_SB(out4, out5, dst);
1695     dst += dst_stride;
1696     PCKEV_ST_SB(out6, out7, dst);
1697     dst += dst_stride;
1698 
1699     for (; loop_cnt--;) {
1700         LD_SB4(src, src_stride, src0, src2, src4, src6);
1701         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702         src += (4 * src_stride);
1703 
1704         VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705         VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706         VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707         VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709                     out0, out1, out2, out3);
1710         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711                     out4, out5, out6, out7);
1712         SRARI_H4_UH(out0, out1, out2, out3, 7);
1713         SRARI_H4_UH(out4, out5, out6, out7, 7);
1714         PCKEV_ST_SB(out0, out1, dst);
1715         dst += dst_stride;
1716         PCKEV_ST_SB(out2, out3, dst);
1717         dst += dst_stride;
1718         PCKEV_ST_SB(out4, out5, dst);
1719         dst += dst_stride;
1720         PCKEV_ST_SB(out6, out7, dst);
1721         dst += dst_stride;
1722     }
1723 }
1724 
common_vt_2t_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1725 static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride,
1726                                  uint8_t *dst, int32_t dst_stride,
1727                                  const int8_t *filter)
1728 {
1729     v16i8 src0, src1, src2, src3, src4;
1730     v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1731     v16u8 filt0;
1732     v8i16 filt;
1733     v8u16 tmp0, tmp1;
1734 
1735     filt = LD_SH(filter);
1736     filt0 = (v16u8) __msa_splati_h(filt, 0);
1737 
1738     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739     src += (5 * src_stride);
1740 
1741     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742                src10_r, src21_r, src32_r, src43_r);
1743     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1745     SRARI_H2_UH(tmp0, tmp1, 7);
1746     SAT_UH2_UH(tmp0, tmp1, 7);
1747     src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748     ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1749 }
1750 
common_vt_2t_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1751 static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
1752                                  uint8_t *dst, int32_t dst_stride,
1753                                  const int8_t *filter)
1754 {
1755     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1756     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757     v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758     v8u16 tmp0, tmp1, tmp2, tmp3;
1759     v16u8 filt0;
1760     v8i16 filt;
1761 
1762     filt = LD_SH(filter);
1763     filt0 = (v16u8) __msa_splati_h(filt, 0);
1764 
1765     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766     src += (8 * src_stride);
1767 
1768     src8 = LD_SB(src);
1769     src += src_stride;
1770 
1771     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1772                src32_r, src43_r);
1773     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1774                src76_r, src87_r);
1775     ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776                src87_r, src76_r, src2110, src4332, src6554, src8776);
1777     DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778                 tmp0, tmp1, tmp2, tmp3);
1779     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1780     SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1781     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782     ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1783 }
1784 
ff_put_vp8_bilinear4_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1785 void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1786                                 uint8_t *src, ptrdiff_t src_stride,
1787                                 int height, int mx, int my)
1788 {
1789     const int8_t *filter = bilinear_filters_msa[my - 1];
1790 
1791     if (4 == height) {
1792         common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1793     } else if (8 == height) {
1794         common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1795     }
1796 }
1797 
common_vt_2t_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter)1798 static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride,
1799                                  uint8_t *dst, int32_t dst_stride,
1800                                  const int8_t *filter)
1801 {
1802     v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1803     v16i8 out0, out1;
1804     v8u16 tmp0, tmp1, tmp2, tmp3;
1805     v8i16 filt;
1806 
1807     /* rearranging filter_y */
1808     filt = LD_SH(filter);
1809     filt0 = (v16u8) __msa_splati_h(filt, 0);
1810 
1811     LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812     ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813     ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815                 tmp0, tmp1, tmp2, tmp3);
1816     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1817     SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1818     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1819     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1820 }
1821 
common_vt_2t_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1822 static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
1823                                      uint8_t *dst, int32_t dst_stride,
1824                                      const int8_t *filter, int32_t height)
1825 {
1826     uint32_t loop_cnt;
1827     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1828     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1829     v16i8 out0, out1;
1830     v8u16 tmp0, tmp1, tmp2, tmp3;
1831     v8i16 filt;
1832 
1833     /* rearranging filter_y */
1834     filt = LD_SH(filter);
1835     filt0 = (v16u8) __msa_splati_h(filt, 0);
1836 
1837     src0 = LD_UB(src);
1838     src += src_stride;
1839 
1840     for (loop_cnt = (height >> 3); loop_cnt--;) {
1841         LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842         src += (8 * src_stride);
1843 
1844         ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845                    vec0, vec1, vec2, vec3);
1846         ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847                    vec4, vec5, vec6, vec7);
1848         DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849                     tmp0, tmp1, tmp2, tmp3);
1850         SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1851         SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1852         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1853         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1854 
1855         DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856                     tmp0, tmp1, tmp2, tmp3);
1857         SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1858         SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
1859         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
1860         ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861         dst += (8 * dst_stride);
1862 
1863         src0 = src8;
1864     }
1865 }
1866 
ff_put_vp8_bilinear8_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1867 void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1868                                 uint8_t *src, ptrdiff_t src_stride,
1869                                 int height, int mx, int my)
1870 {
1871     const int8_t *filter = bilinear_filters_msa[my - 1];
1872 
1873     if (4 == height) {
1874         common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
1875     } else {
1876         common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
1877                                  height);
1878     }
1879 }
1880 
ff_put_vp8_bilinear16_v_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)1881 void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
1882                                  uint8_t *src, ptrdiff_t src_stride,
1883                                  int height, int mx, int my)
1884 {
1885     uint32_t loop_cnt;
1886     const int8_t *filter = bilinear_filters_msa[my - 1];
1887     v16u8 src0, src1, src2, src3, src4;
1888     v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889     v8u16 tmp0, tmp1, tmp2, tmp3;
1890     v8i16 filt;
1891 
1892     /* rearranging filter_y */
1893     filt = LD_SH(filter);
1894     filt0 = (v16u8) __msa_splati_h(filt, 0);
1895 
1896     src0 = LD_UB(src);
1897     src += src_stride;
1898 
1899     for (loop_cnt = (height >> 2); loop_cnt--;) {
1900         LD_UB4(src, src_stride, src1, src2, src3, src4);
1901         src += (4 * src_stride);
1902 
1903         ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904         ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905         DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1906         SRARI_H2_UH(tmp0, tmp1, 7);
1907         SAT_UH2_UH(tmp0, tmp1, 7);
1908         PCKEV_ST_SB(tmp0, tmp1, dst);
1909         dst += dst_stride;
1910 
1911         ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912         ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913         DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1914         SRARI_H2_UH(tmp2, tmp3, 7);
1915         SAT_UH2_UH(tmp2, tmp3, 7);
1916         PCKEV_ST_SB(tmp2, tmp3, dst);
1917         dst += dst_stride;
1918 
1919         DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1920         SRARI_H2_UH(tmp0, tmp1, 7);
1921         SAT_UH2_UH(tmp0, tmp1, 7);
1922         PCKEV_ST_SB(tmp0, tmp1, dst);
1923         dst += dst_stride;
1924 
1925         DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1926         SRARI_H2_UH(tmp2, tmp3, 7);
1927         SAT_UH2_UH(tmp2, tmp3, 7);
1928         PCKEV_ST_SB(tmp2, tmp3, dst);
1929         dst += dst_stride;
1930 
1931         src0 = src4;
1932     }
1933 }
1934 
common_hv_2ht_2vt_4x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)1935 static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride,
1936                                       uint8_t *dst, int32_t dst_stride,
1937                                       const int8_t *filter_horiz,
1938                                       const int8_t *filter_vert)
1939 {
1940     v16i8 src0, src1, src2, src3, src4, mask;
1941     v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
1943 
1944     mask = LD_SB(&mc_filt_mask_arr[16]);
1945 
1946     /* rearranging filter */
1947     filt = LD_UH(filter_horiz);
1948     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1949 
1950     filt = LD_UH(filter_vert);
1951     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1952 
1953     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1954     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1955     hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1956     hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
1957     hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958     hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1959 
1960     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1962     SRARI_H2_UH(tmp0, tmp1, 7);
1963     SAT_UH2_UH(tmp0, tmp1, 7);
1964     PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
1965     ST_W2(res0, 0, 1, dst, dst_stride);
1966     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1967 }
1968 
common_hv_2ht_2vt_4x8_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)1969 static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
1970                                       uint8_t *dst, int32_t dst_stride,
1971                                       const int8_t *filter_horiz,
1972                                       const int8_t *filter_vert)
1973 {
1974     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
1975     v16i8 res0, res1, res2, res3;
1976     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977     v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978     v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
1979 
1980     mask = LD_SB(&mc_filt_mask_arr[16]);
1981 
1982     /* rearranging filter */
1983     filt = LD_UH(filter_horiz);
1984     filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
1985 
1986     filt = LD_UH(filter_vert);
1987     filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
1988 
1989     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990     src += (8 * src_stride);
1991     src8 = LD_SB(src);
1992 
1993     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
1994     hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
1995     hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
1996     hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
1997     hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
1998     SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
1999                hz_out3, hz_out5);
2000     hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2001 
2002     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003     ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005                 vec4, vec5, vec6, vec7);
2006     SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2007     SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2008     PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009                 res0, res1, res2, res3);
2010     ST_W2(res0, 0, 1, dst, dst_stride);
2011     ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012     ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013     ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2014 }
2015 
ff_put_vp8_bilinear4_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2016 void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2017                                  uint8_t *src, ptrdiff_t src_stride,
2018                                  int height, int mx, int my)
2019 {
2020     const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2021     const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2022 
2023     if (4 == height) {
2024         common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2025                                   filter_horiz, filter_vert);
2026     } else if (8 == height) {
2027         common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2028                                   filter_horiz, filter_vert);
2029     }
2030 }
2031 
common_hv_2ht_2vt_8x4_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert)2032 static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride,
2033                                       uint8_t *dst, int32_t dst_stride,
2034                                       const int8_t *filter_horiz,
2035                                       const int8_t *filter_vert)
2036 {
2037     v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2038     v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039     v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2040     v8i16 filt;
2041 
2042     mask = LD_SB(&mc_filt_mask_arr[0]);
2043 
2044     /* rearranging filter */
2045     filt = LD_SH(filter_horiz);
2046     filt_hz = (v16u8) __msa_splati_h(filt, 0);
2047 
2048     filt = LD_SH(filter_vert);
2049     filt_vt = (v16u8) __msa_splati_h(filt, 0);
2050 
2051     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2052 
2053     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2054     hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2055     vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2057 
2058     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2059     vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060     tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2061 
2062     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2063     vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064     tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2065 
2066     hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2067     vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068     tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2069 
2070     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2071     SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2072     PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2073     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2074 }
2075 
common_hv_2ht_2vt_8x8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)2076 static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
2077                                           uint8_t *dst, int32_t dst_stride,
2078                                           const int8_t *filter_horiz,
2079                                           const int8_t *filter_vert,
2080                                           int32_t height)
2081 {
2082     uint32_t loop_cnt;
2083     v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2084     v16u8 filt_hz, filt_vt, vec0;
2085     v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2086     v8i16 filt;
2087 
2088     mask = LD_SB(&mc_filt_mask_arr[0]);
2089 
2090     /* rearranging filter */
2091     filt = LD_SH(filter_horiz);
2092     filt_hz = (v16u8) __msa_splati_h(filt, 0);
2093 
2094     filt = LD_SH(filter_vert);
2095     filt_vt = (v16u8) __msa_splati_h(filt, 0);
2096 
2097     src0 = LD_SB(src);
2098     src += src_stride;
2099 
2100     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2101 
2102     for (loop_cnt = (height >> 3); loop_cnt--;) {
2103         LD_SB4(src, src_stride, src1, src2, src3, src4);
2104         src += (4 * src_stride);
2105 
2106         hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2107         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108         tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2109 
2110         hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2111         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112         tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2113 
2114         SRARI_H2_UH(tmp1, tmp2, 7);
2115         SAT_UH2_UH(tmp1, tmp2, 7);
2116 
2117         hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2118         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119         tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2120 
2121         hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2122         LD_SB4(src, src_stride, src1, src2, src3, src4);
2123         src += (4 * src_stride);
2124         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125         tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2126 
2127         SRARI_H2_UH(tmp3, tmp4, 7);
2128         SAT_UH2_UH(tmp3, tmp4, 7);
2129         PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2130         ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2131 
2132         hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2133         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134         tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2135 
2136         hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2137         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138         tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2139 
2140         hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2141         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142         tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2143 
2144         hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2145         vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146         tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2147 
2148         SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2149         SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2150         PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2151         ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152         dst += (8 * dst_stride);
2153     }
2154 }
2155 
ff_put_vp8_bilinear8_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2156 void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2157                                  uint8_t *src, ptrdiff_t src_stride,
2158                                  int height, int mx, int my)
2159 {
2160     const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2161     const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2162 
2163     if (4 == height) {
2164         common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2165                                   filter_horiz, filter_vert);
2166     } else {
2167         common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2168                                       filter_horiz, filter_vert, height);
2169     }
2170 }
2171 
ff_put_vp8_bilinear16_hv_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2172 void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2173                                   uint8_t *src, ptrdiff_t src_stride,
2174                                   int height, int mx, int my)
2175 {
2176     uint32_t loop_cnt;
2177     const int8_t *filter_horiz = bilinear_filters_msa[mx - 1];
2178     const int8_t *filter_vert = bilinear_filters_msa[my - 1];
2179     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2180     v16u8 filt_hz, filt_vt, vec0, vec1;
2181     v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2182     v8i16 filt;
2183 
2184     mask = LD_SB(&mc_filt_mask_arr[0]);
2185 
2186     /* rearranging filter */
2187     filt = LD_SH(filter_horiz);
2188     filt_hz = (v16u8) __msa_splati_h(filt, 0);
2189 
2190     filt = LD_SH(filter_vert);
2191     filt_vt = (v16u8) __msa_splati_h(filt, 0);
2192 
2193     LD_SB2(src, 8, src0, src1);
2194     src += src_stride;
2195 
2196     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2197     hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2198 
2199 
2200     for (loop_cnt = (height >> 2); loop_cnt--;) {
2201         LD_SB4(src, src_stride, src0, src2, src4, src6);
2202         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203         src += (4 * src_stride);
2204 
2205         hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2206         hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2207         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2209         SRARI_H2_UH(tmp1, tmp2, 7);
2210         SAT_UH2_UH(tmp1, tmp2, 7);
2211         PCKEV_ST_SB(tmp1, tmp2, dst);
2212         dst += dst_stride;
2213 
2214         hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2215         hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2216         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2218         SRARI_H2_UH(tmp1, tmp2, 7);
2219         SAT_UH2_UH(tmp1, tmp2, 7);
2220         PCKEV_ST_SB(tmp1, tmp2, dst);
2221         dst += dst_stride;
2222 
2223         hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2224         hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2225         ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2227         SRARI_H2_UH(tmp1, tmp2, 7);
2228         SAT_UH2_UH(tmp1, tmp2, 7);
2229         PCKEV_ST_SB(tmp1, tmp2, dst);
2230         dst += dst_stride;
2231 
2232         hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2233         hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2234         ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235         DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2236         SRARI_H2_UH(tmp1, tmp2, 7);
2237         SAT_UH2_UH(tmp1, tmp2, 7);
2238         PCKEV_ST_SB(tmp1, tmp2, dst);
2239         dst += dst_stride;
2240     }
2241 }
2242 
ff_put_vp8_pixels8_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2243 void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride,
2244                             uint8_t *src, ptrdiff_t src_stride,
2245                             int height, int mx, int my)
2246 {
2247     int32_t cnt;
2248     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2250 
2251     if (0 == height % 8) {
2252         for (cnt = height >> 3; cnt--;) {
2253             LD_UB8(src, src_stride,
2254                    src0, src1, src2, src3, src4, src5, src6, src7);
2255             src += (8 * src_stride);
2256 
2257             out0 = __msa_copy_u_d((v2i64) src0, 0);
2258             out1 = __msa_copy_u_d((v2i64) src1, 0);
2259             out2 = __msa_copy_u_d((v2i64) src2, 0);
2260             out3 = __msa_copy_u_d((v2i64) src3, 0);
2261             out4 = __msa_copy_u_d((v2i64) src4, 0);
2262             out5 = __msa_copy_u_d((v2i64) src5, 0);
2263             out6 = __msa_copy_u_d((v2i64) src6, 0);
2264             out7 = __msa_copy_u_d((v2i64) src7, 0);
2265 
2266             SD4(out0, out1, out2, out3, dst, dst_stride);
2267             dst += (4 * dst_stride);
2268             SD4(out4, out5, out6, out7, dst, dst_stride);
2269             dst += (4 * dst_stride);
2270         }
2271     } else if (0 == height % 4) {
2272         for (cnt = (height / 4); cnt--;) {
2273             LD_UB4(src, src_stride, src0, src1, src2, src3);
2274             src += (4 * src_stride);
2275             out0 = __msa_copy_u_d((v2i64) src0, 0);
2276             out1 = __msa_copy_u_d((v2i64) src1, 0);
2277             out2 = __msa_copy_u_d((v2i64) src2, 0);
2278             out3 = __msa_copy_u_d((v2i64) src3, 0);
2279 
2280             SD4(out0, out1, out2, out3, dst, dst_stride);
2281             dst += (4 * dst_stride);
2282         }
2283     }
2284 }
2285 
copy_16multx8mult_msa(uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int32_t height,int32_t width)2286 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
2287                                   uint8_t *dst, int32_t dst_stride,
2288                                   int32_t height, int32_t width)
2289 {
2290     int32_t cnt, loop_cnt;
2291     uint8_t *src_tmp, *dst_tmp;
2292     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2293 
2294     for (cnt = (width >> 4); cnt--;) {
2295         src_tmp = src;
2296         dst_tmp = dst;
2297 
2298         for (loop_cnt = (height >> 3); loop_cnt--;) {
2299             LD_UB8(src_tmp, src_stride,
2300                    src0, src1, src2, src3, src4, src5, src6, src7);
2301             src_tmp += (8 * src_stride);
2302 
2303             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304                    dst_tmp, dst_stride);
2305             dst_tmp += (8 * dst_stride);
2306         }
2307 
2308         src += 16;
2309         dst += 16;
2310     }
2311 }
2312 
ff_put_vp8_pixels16_msa(uint8_t * dst,ptrdiff_t dst_stride,uint8_t * src,ptrdiff_t src_stride,int height,int mx,int my)2313 void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314                             uint8_t *src, ptrdiff_t src_stride,
2315                             int height, int mx, int my)
2316 {
2317     int32_t cnt;
2318     v16u8 src0, src1, src2, src3;
2319 
2320     if (0 == height % 8) {
2321         copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2322     } else if (0 == height % 4) {
2323         for (cnt = (height >> 2); cnt--;) {
2324             LD_UB4(src, src_stride, src0, src1, src2, src3);
2325             src += (4 * src_stride);
2326 
2327             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328             dst += (4 * dst_stride);
2329         }
2330     }
2331 }
2332