/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
14 
common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)15 static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
16     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
17     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
18   uint32_t loop_cnt;
19   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
20   v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
21   v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
22   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
23   v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
24   v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
25 
26   mask0 = LD_UB(&mc_filt_mask_arr[16]);
27   src -= (3 + 3 * src_stride);
28 
29   /* rearranging filter */
30   filt = LD_SH(filter_horiz);
31   SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
32 
33   mask1 = mask0 + 2;
34   mask2 = mask0 + 4;
35   mask3 = mask0 + 6;
36 
37   LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
38   XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
39   src += (7 * src_stride);
40 
41   hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
42                             filt_hz1, filt_hz2, filt_hz3);
43   hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
44                             filt_hz1, filt_hz2, filt_hz3);
45   hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
46                             filt_hz1, filt_hz2, filt_hz3);
47   hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
48                             filt_hz1, filt_hz2, filt_hz3);
49   SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
50 
51   filt = LD_SH(filter_vert);
52   SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
53 
54   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
55   vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
56 
57   for (loop_cnt = (height >> 2); loop_cnt--;) {
58     LD_SB4(src, src_stride, src7, src8, src9, src10);
59     XORI_B4_128_SB(src7, src8, src9, src10);
60     src += (4 * src_stride);
61 
62     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
63     hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
64                               filt_hz1, filt_hz2, filt_hz3);
65     hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
66     vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
67     res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
68                                filt_vt2, filt_vt3);
69 
70     hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
71                               filt_hz1, filt_hz2, filt_hz3);
72     hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
73     vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
74     res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
75                                filt_vt2, filt_vt3);
76     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
77 
78     SRARI_H2_SH(res0, res1, FILTER_BITS);
79     SAT_SH2_SH(res0, res1, 7);
80     PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
81     XORI_B2_128_UB(tmp0, tmp1);
82     AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
83     ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
84     dst += (4 * dst_stride);
85 
86     hz_out5 = hz_out9;
87     vec0 = vec2;
88     vec1 = vec3;
89     vec2 = vec4;
90   }
91 }
92 
common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)93 static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
94     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
95     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
96   uint32_t loop_cnt;
97   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
98   v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
99   v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
100   v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
101   v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
102   v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
103   v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
104 
105   mask0 = LD_UB(&mc_filt_mask_arr[0]);
106   src -= (3 + 3 * src_stride);
107 
108   /* rearranging filter */
109   filt = LD_SH(filter_horiz);
110   SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
111 
112   mask1 = mask0 + 2;
113   mask2 = mask0 + 4;
114   mask3 = mask0 + 6;
115 
116   LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
117   src += (7 * src_stride);
118 
119   XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
120   hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
121                             filt_hz1, filt_hz2, filt_hz3);
122   hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
123                             filt_hz1, filt_hz2, filt_hz3);
124   hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
125                             filt_hz1, filt_hz2, filt_hz3);
126   hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
127                             filt_hz1, filt_hz2, filt_hz3);
128   hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
129                             filt_hz1, filt_hz2, filt_hz3);
130   hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
131                             filt_hz1, filt_hz2, filt_hz3);
132   hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
133                             filt_hz1, filt_hz2, filt_hz3);
134 
135   filt = LD_SH(filter_vert);
136   SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
137 
138   ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
139   ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
140   ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
141 
142   for (loop_cnt = (height >> 2); loop_cnt--;) {
143     LD_SB4(src, src_stride, src7, src8, src9, src10);
144     XORI_B4_128_SB(src7, src8, src9, src10);
145     src += (4 * src_stride);
146 
147     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
148 
149     hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
150                               filt_hz1, filt_hz2, filt_hz3);
151     out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
152     tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
153                                filt_vt2, filt_vt3);
154 
155     hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
156                               filt_hz1, filt_hz2, filt_hz3);
157     out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
158     tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
159                                filt_vt2, filt_vt3);
160 
161     hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
162                               filt_hz1, filt_hz2, filt_hz3);
163     out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
164     tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
165                                filt_vt2, filt_vt3);
166 
167     hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
168                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
169     out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
170     tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
171                                filt_vt2, filt_vt3);
172 
173     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
174     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
175     CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
176                             dst_stride);
177     dst += (4 * dst_stride);
178 
179     hz_out6 = hz_out10;
180     out0 = out2;
181     out1 = out3;
182     out2 = out8;
183     out4 = out6;
184     out5 = out7;
185     out6 = out9;
186   }
187 }
188 
/* 16-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on two
 * adjacent 8-pixel columns, left to right. All other arguments are forwarded
 * unchanged. */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
200 
/* 32-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on four
 * adjacent 8-pixel columns, left to right. All other arguments are forwarded
 * unchanged. */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
212 
/* 64-wide 8-tap/8-tap filter-and-average: runs the 8-wide kernel on eight
 * adjacent 8-pixel columns, left to right. All other arguments are forwarded
 * unchanged. */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
224 
common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)225 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
226     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
227     int8_t *filter_horiz, int8_t *filter_vert) {
228   v16i8 src0, src1, src2, src3, src4, mask;
229   v16u8 filt_hz, filt_vt, vec0, vec1;
230   v16u8 dst0, dst1, dst2, dst3, res0, res1;
231   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
232 
233   mask = LD_SB(&mc_filt_mask_arr[16]);
234 
235   /* rearranging filter */
236   filt = LD_UH(filter_horiz);
237   filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
238 
239   filt = LD_UH(filter_vert);
240   filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
241 
242   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
243 
244   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
245   hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
246   hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
247   hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
248   hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
249   ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
250 
251   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
252   ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
253   DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
254   SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
255   PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
256   AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
257   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
258 }
259 
common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)260 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
261     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
262     int8_t *filter_horiz, int8_t *filter_vert) {
263   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
264   v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
265   v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
266   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
267   v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
268   v8i16 filt;
269 
270   mask = LD_SB(&mc_filt_mask_arr[16]);
271 
272   /* rearranging filter */
273   filt = LD_SH(filter_horiz);
274   filt_hz = (v16u8)__msa_splati_h(filt, 0);
275 
276   filt = LD_SH(filter_vert);
277   filt_vt = (v16u8)__msa_splati_h(filt, 0);
278 
279   LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
280   src += (8 * src_stride);
281   src8 = LD_SB(src);
282 
283   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
284   hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
285   hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
286   hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
287   hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
288   SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
289              hz_out3, hz_out5, 8);
290   hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
291 
292   LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
293   ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
294              dst6);
295   ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
296   ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
297   DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
298               tmp1, tmp2, tmp3);
299   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
300   PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
301               res3);
302   AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
303               res3);
304   ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
305   dst += (4 * dst_stride);
306   ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
307 }
308 
/* 4-wide 2-tap/2-tap filter-and-average dispatcher: picks the 4x4 or 4x8
 * kernel by height. Any other height does nothing and leaves dst untouched. */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  switch (height) {
    case 4:
      common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    case 8:
      common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert);
      break;
    default:
      /* No kernel for this height. */
      break;
  }
}
320 
common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)321 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
322     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
323     int8_t *filter_horiz, int8_t *filter_vert) {
324   v16i8 src0, src1, src2, src3, src4, mask;
325   v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
326   v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
327   v8i16 filt;
328 
329   mask = LD_SB(&mc_filt_mask_arr[0]);
330 
331   /* rearranging filter */
332   filt = LD_SH(filter_horiz);
333   filt_hz = (v16u8)__msa_splati_h(filt, 0);
334 
335   filt = LD_SH(filter_vert);
336   filt_vt = (v16u8)__msa_splati_h(filt, 0);
337 
338   LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
339   src += (5 * src_stride);
340 
341   LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
342   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
343   hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
344   vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
345   tmp0 = __msa_dotp_u_h(vec0, filt_vt);
346 
347   hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
348   vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
349   tmp1 = __msa_dotp_u_h(vec1, filt_vt);
350 
351   hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
352   vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
353   tmp2 = __msa_dotp_u_h(vec2, filt_vt);
354 
355   hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
356   vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
357   tmp3 = __msa_dotp_u_h(vec3, filt_vt);
358 
359   SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
360   PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
361                      dst_stride);
362 }
363 
common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)364 static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
365     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
366     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
367   uint32_t loop_cnt;
368   v16i8 src0, src1, src2, src3, src4, mask;
369   v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
370   v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
371   v8i16 filt;
372 
373   mask = LD_SB(&mc_filt_mask_arr[0]);
374 
375   /* rearranging filter */
376   filt = LD_SH(filter_horiz);
377   filt_hz = (v16u8)__msa_splati_h(filt, 0);
378 
379   filt = LD_SH(filter_vert);
380   filt_vt = (v16u8)__msa_splati_h(filt, 0);
381 
382   src0 = LD_SB(src);
383   src += src_stride;
384 
385   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
386 
387   for (loop_cnt = (height >> 2); loop_cnt--;) {
388     LD_SB4(src, src_stride, src1, src2, src3, src4);
389     src += (4 * src_stride);
390 
391     hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
392     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
393     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
394 
395     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
396     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
397     tmp1 = __msa_dotp_u_h(vec0, filt_vt);
398 
399     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
400 
401     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
402     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
403     tmp2 = __msa_dotp_u_h(vec0, filt_vt);
404 
405     hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
406     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
407     tmp3 = __msa_dotp_u_h(vec0, filt_vt);
408 
409     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
410     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
411     PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
412                        dst_stride);
413     dst += (4 * dst_stride);
414   }
415 }
416 
/* 8-wide 2-tap/2-tap filter-and-average dispatcher: height 4 goes to the
 * dedicated 8x4 kernel, every other height to the looping kernel. */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
428 
common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)429 static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
430     const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
431     int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
432   uint32_t loop_cnt;
433   v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
434   v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
435   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
436   v8i16 filt;
437 
438   mask = LD_SB(&mc_filt_mask_arr[0]);
439 
440   /* rearranging filter */
441   filt = LD_SH(filter_horiz);
442   filt_hz = (v16u8)__msa_splati_h(filt, 0);
443 
444   filt = LD_SH(filter_vert);
445   filt_vt = (v16u8)__msa_splati_h(filt, 0);
446 
447   LD_SB2(src, 8, src0, src1);
448   src += src_stride;
449 
450   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
451   hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
452 
453   for (loop_cnt = (height >> 2); loop_cnt--;) {
454     LD_SB4(src, src_stride, src0, src2, src4, src6);
455     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
456     src += (4 * src_stride);
457     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
458 
459     hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
460     hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
461     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
462     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
463     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
464     PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
465     dst += dst_stride;
466 
467     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
468     hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
469     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
470     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
471     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
472     PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
473     dst += dst_stride;
474 
475     hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
476     hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
477     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
478     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
479     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
480     PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
481     dst += dst_stride;
482 
483     hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
484     hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
485     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
486     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
487     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
488     PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
489     dst += dst_stride;
490   }
491 }
492 
/* 32-wide 2-tap/2-tap filter-and-average: runs the 16-wide kernel on two
 * adjacent 16-pixel columns, left to right. All other arguments are forwarded
 * unchanged. */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
504 
/* 64-wide 2-tap/2-tap filter-and-average: runs the 16-wide kernel on four
 * adjacent 16-pixel columns, left to right. All other arguments are forwarded
 * unchanged. */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
516 
/* 2-D convolve-and-average entry point.
 *
 * Narrows the 16-bit taps of filter_x / filter_y to 8 bits and dispatches on
 * the filter shape and block width w:
 *   - taps 0 and 1 zero in BOTH filters: 2-tap kernels, which are handed a
 *     pointer to tap 3 (they use taps 3 and 4);
 *   - taps 0 and 1 zero in exactly ONE filter: vpx_convolve8_avg_c;
 *   - otherwise: 8-tap kernels.
 * Widths other than 4, 8, 16, 32 and 64 also go to vpx_convolve8_avg_c.
 *
 * x_step_q4 and y_step_q4 must both be 16 (asserted); they are only forwarded
 * to the C fallback.
 */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  /* The kernels load a whole vector register from the pointer they are given
   * (presumably 16 bytes, matching the v8i16 they load into -- TODO confirm
   * against the LD_SH definition), and the 2-tap path starts at element 3.
   * Zero-padded 24-byte buffers keep that load inside the array. */
  int8_t filt_hor[24] = { 0 };
  int8_t filt_ver[24] = { 0 };
  /* The taps are tested element-wise as int16_t rather than through an
   * int32_t pointer cast, which would break the effective-type rules. */
  const int x_lead_zero = (filter_x[0] == 0) && (filter_x[1] == 0);
  const int y_lead_zero = (filter_y[0] == 0) && (filter_y[1] == 0);
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* Reject taps[2] == 0 together with taps[3] == 128. This is what the former
   * 32-bit compare against 0x800000 matched on a little-endian target. */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = (int8_t)filter_x[cnt];
    filt_ver[cnt] = (int8_t)filter_y[cnt];
  }

  if (x_lead_zero && y_lead_zero) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else if (x_lead_zero || y_lead_zero) {
    /* Mixed filter shapes have no MSA kernel here. */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
605