/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp8_rtcd.h"
#include "vpx_ports/mem.h"
#include "vp8/common/filter.h"
#include "vp8/common/mips/msa/vp8_macros_msa.h"

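/* Two-tap bilinear filter pairs for sub-pixel offsets 1..7; each pair sums
 * to 128 (1 << VP8_FILTER_SHIFT). Offset 0 needs no filtering and is handled
 * by a direct copy in the predict functions, so the table is indexed with
 * [offset - 1]. */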
DECLARE_ALIGNED(16, static const int8_t, vp8_bilinear_filters_msa[7][2]) = {
  { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 },
  { 48, 80 },  { 32, 96 }, { 16, 112 }
};

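/* VSHF_B shuffle masks that gather horizontally adjacent pixel pairs for the
 * 2-tap filters: indices 0..15 select bytes from the first source vector and
 * 16..31 from the second, so the 4-wide masks pull pixels from two rows held
 * in separate registers. */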
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

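/* Horizontal-only 2-tap filters: each output pixel is a weighted average of
 * two horizontally adjacent source pixels, rounded by VP8_FILTER_SHIFT. */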
static void common_hz_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}

static void common_hz_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

static void common_hz_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_hz_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
  SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, VP8_FILTER_SHIFT);
    SRARI_H4_UH(out4, out5, out6, out7, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}

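/* Vertical-only 2-tap filters: each output row is a weighted average of two
 * vertically adjacent source rows. */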
static void common_vt_2t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
  v16u8 filt0;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  src += (5 * src_stride);

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  src2110 = __msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
}

static void common_vt_2t_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);

  src8 = LD_SB(src);
  src += src_stride;

  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_vt_2t_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_vt_2t_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

static void common_vt_2t_8x8mult_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v16i8 out0, out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);

    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    src0 = src8;
  }
}

static void common_vt_2t_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  if (4 == height) {
    common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}

static void common_vt_2t_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp0, tmp1, dst);
    dst += dst_stride;

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp2, tmp3, dst);
    dst += dst_stride;

    src0 = src4;
  }
}

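/* Combined 2-tap horizontal + vertical filters: rows are first filtered
 * horizontally into 16-bit intermediates, which are then filtered vertically
 * before packing back to 8 bits. */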
static void common_hv_2ht_2vt_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask;
  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
  hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, VP8_FILTER_SHIFT);
  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_4x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
  v16i8 res0, res1, res2, res3;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[16]);

  filt = LD_UH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
  filt = LD_UH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, VP8_FILTER_SHIFT);
  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
             hz_out3, hz_out5, 8);
  hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);

  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, vec4,
              vec5, vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, VP8_FILTER_SHIFT);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hv_2ht_2vt_4w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else if (8 == height) {
    common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  }
}

static void common_hv_2ht_2vt_8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert) {
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
  vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp0 = __msa_dotp_u_h(vec0, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
  vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp1 = __msa_dotp_u_h(vec1, filt_vt);

  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
  vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
  tmp2 = __msa_dotp_u_h(vec2, filt_vt);

  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
  vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
  tmp3 = __msa_dotp_u_h(vec3, filt_vt);

  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, VP8_FILTER_SHIFT);
  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
}

static void common_hv_2ht_2vt_8x8mult_msa(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  src0 = LD_SB(src);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    LD_SB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp4 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp3, tmp4, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp5 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp6 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp7 = __msa_dotp_u_h(vec0, filt_vt);

    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp8 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, VP8_FILTER_SHIFT);
    PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hv_2ht_2vt_8w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  if (4 == height) {
    common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride, filter_horiz,
                              filter_vert);
  } else {
    common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
                                  filter_horiz, filter_vert, height);
  }
}

static void common_hv_2ht_2vt_16w_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
  v8i16 filt;

  mask = LD_SB(&vp8_mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_SH(filter_horiz);
  filt_hz = (v16u8)__msa_splati_h(filt, 0);
  filt = LD_SH(filter_vert);
  filt_vt = (v16u8)__msa_splati_h(filt, 0);

  LD_SB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, VP8_FILTER_SHIFT);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, VP8_FILTER_SHIFT);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
    SRARI_H2_UH(tmp1, tmp2, VP8_FILTER_SHIFT);
    PCKEV_ST_SB(tmp1, tmp2, dst);
    dst += dst_stride;
  }
}

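/* Exported bilinear predictors. xoffset/yoffset select the sub-pixel filter
 * in each direction; a zero offset skips filtering in that direction, and a
 * (0, 0) offset degenerates to a plain block copy. */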
void vp8_bilinear_predict4x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_4w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 4);
    } else {
      common_vt_2t_4w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    }
  } else {
    if (xoffset) {
      common_hz_2t_4w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    } else {
      uint32_t tp0, tp1, tp2, tp3;

      LW4(src, src_stride, tp0, tp1, tp2, tp3);
      SW4(tp0, tp1, tp2, tp3, dst, dst_stride);
    }
  }
}

void vp8_bilinear_predict8x4_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 4);
    } else {
      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 4);
    }
  } else {
    if (xoffset) {
      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 4);
    } else {
      vp8_copy_mem8x4(src, src_stride, dst, dst_stride);
    }
  }
}

void vp8_bilinear_predict8x8_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_8w_msa(src, src_stride, dst, dst_stride, h_filter,
                               v_filter, 8);
    } else {
      common_vt_2t_8w_msa(src, src_stride, dst, dst_stride, v_filter, 8);
    }
  } else {
    if (xoffset) {
      common_hz_2t_8w_msa(src, src_stride, dst, dst_stride, h_filter, 8);
    } else {
      vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
    }
  }
}

void vp8_bilinear_predict16x16_msa(uint8_t *RESTRICT src, int32_t src_stride,
                                   int32_t xoffset, int32_t yoffset,
                                   uint8_t *RESTRICT dst, int32_t dst_stride) {
  const int8_t *h_filter = vp8_bilinear_filters_msa[xoffset - 1];
  const int8_t *v_filter = vp8_bilinear_filters_msa[yoffset - 1];

  if (yoffset) {
    if (xoffset) {
      common_hv_2ht_2vt_16w_msa(src, src_stride, dst, dst_stride, h_filter,
                                v_filter, 16);
    } else {
      common_vt_2t_16w_msa(src, src_stride, dst, dst_stride, v_filter, 16);
    }
  } else {
    if (xoffset) {
      common_hz_2t_16w_msa(src, src_stride, dst, dst_stride, h_filter, 16);
    } else {
      vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
    }
  }
}