/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/vpx_convolve_msa.h"
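
/* Horizontal convolution helpers (MIPS MSA). Naming convention for the
 * static functions below: "hz" = horizontal pass, "8t" = 8-tap filter,
 * "2t" = 2-tap (bilinear) filter, and the trailing "Nw"/"NxM" gives the
 * block width (and, where fixed, height). Intermediate results are rounded
 * by FILTER_BITS before being packed back to 8-bit pixels.
 */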
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 mask0, mask1, mask2, mask3, out;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  SRARI_H2_SH(out0, out1, FILTER_BITS);
  SAT_SH2_SH(out0, out1, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 filt0, filt1, filt2, filt3;
  v16i8 src0, src1, src2, src3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[16]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  src += (4 * src_stride);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1);
  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out2, out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  out = PCKEV_XORI128_UB(out0, out1);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  out = PCKEV_XORI128_UB(out2, out3);
  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
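
/* Width-4 8-tap dispatcher; only block heights of 4 and 8 are handled. */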
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  XORI_B4_128_SB(src0, src1, src2, src3);
  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
                             filt0, filt1, filt2, filt3, out0, out1, out2,
                             out3);
  SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
  SAT_SH4_SH(out0, out1, out2, out3, 7);
  tmp0 = PCKEV_XORI128_UB(out0, out1);
  tmp1 = PCKEV_XORI128_UB(out2, out3);
  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
    dst += (4 * dst_stride);
  }
}

static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    LD_SB2(src, src_stride, src0, src2);
    LD_SB2(src + 8, src_stride, src1, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (2 * src_stride);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    dst += dst_stride;
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst);
    dst += dst_stride;
  }
}

static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);

    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);
    dst += dst_stride;
  }
}
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  int32_t loop_cnt;
  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
  v16u8 mask0, mask1, mask2, mask3, out;
  v8i16 filt, out0, out1, out2, out3;

  mask0 = LD_UB(&mc_filt_mask_arr[0]);
  src -= 3;

  /* rearranging filter */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  mask1 = mask0 + 2;
  mask2 = mask0 + 4;
  mask3 = mask0 + 6;

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 16);

    src0 = LD_SB(src + 32);
    src2 = LD_SB(src + 48);
    src3 = LD_SB(src + 56);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_UB(out, dst + 32);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_UB(out, dst + 48);
    dst += dst_stride;
  }
}
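
/* The functions below implement the 2-tap (bilinear) path: the single pair
 * of filter taps is splatted across a vector and applied with unsigned byte
 * dot products, so no sign-bias (XORI 128) or saturation step is needed.
 */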
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16i8 src0, src1, src2, src3, mask;
  v16u8 filt0, vec0, vec1, res0, res1;
  v8u16 vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
  SRARI_H2_UH(vec2, vec3, FILTER_BITS);
  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 vec0, vec1, vec2, vec3, filt0;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16i8 res0, res1, res2, res3;
  v8u16 vec4, vec5, vec6, vec7, filt;

  mask = LD_SB(&mc_filt_mask_arr[16]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
              vec6, vec7);
  SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS);
  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2,
              res3);
  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
}

static void common_hz_2t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
  } else if (8 == height) {
    common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
  }
}

static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
  ST8x4_UB(src0, src1, dst, dst_stride);
}
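
/* Width-8 2-tap helper for heights that are a multiple of 8: the first
 * 8 rows are always produced, and the second unrolled block runs only
 * when height == 16.
 */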
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int8_t *filter, int32_t height) {
  v16u8 filt0;
  v16i8 src0, src1, src2, src3, mask, out0, out1;
  v8u16 vec0, vec1, vec2, vec3, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
              vec2, vec3);
  SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
  ST8x4_UB(out0, out1, dst, dst_stride);
  dst += (4 * dst_stride);

  if (16 == height) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst, dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
    ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
  }
}

static void common_hz_2t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int8_t *filter, int32_t height) {
  if (4 == height) {
    common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
  } else {
    common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter, height);
  }
}
static void common_hz_2t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  loop_cnt = (height >> 2) - 1;

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  LD_SB4(src, src_stride, src0, src2, src4, src6);
  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
  src += (4 * src_stride);

  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
              out2, out3);
  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
              out6, out7);
  SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
  SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
  PCKEV_ST_SB(out0, out1, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out2, out3, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out4, out5, dst);
  dst += dst_stride;
  PCKEV_ST_SB(out6, out7, dst);
  dst += dst_stride;

  for (; loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out2, out3, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    dst += dst_stride;
    PCKEV_ST_SB(out6, out7, dst);
    dst += dst_stride;
  }
}
static void common_hz_2t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height >> 1; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src3 = LD_SB(src + 24);
    src1 = __msa_sldi_b(src2, src0, 8);
    src += src_stride;
    src4 = LD_SB(src);
    src6 = LD_SB(src + 16);
    src7 = LD_SB(src + 24);
    src5 = __msa_sldi_b(src6, src4, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    dst += dst_stride;
    PCKEV_ST_SB(out4, out5, dst);
    PCKEV_ST_SB(out6, out7, dst + 16);
    dst += dst_stride;
  }
}

static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

  mask = LD_SB(&mc_filt_mask_arr[0]);

  /* rearranging filter */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  for (loop_cnt = height; loop_cnt--;) {
    src0 = LD_SB(src);
    src2 = LD_SB(src + 16);
    src4 = LD_SB(src + 32);
    src6 = LD_SB(src + 48);
    src7 = LD_SB(src + 56);
    SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
    src += src_stride;

    VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_ST_SB(out0, out1, dst);
    PCKEV_ST_SB(out2, out3, dst + 16);
    PCKEV_ST_SB(out4, out5, dst + 32);
    PCKEV_ST_SB(out6, out7, dst + 48);
    dst += dst_stride;
  }
}
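
/* Public entry point: copies the selected x-phase filter to int8, then
 * dispatches on the tap count (2-tap bilinear vs. 8-tap) and on block width;
 * unsupported widths fall back to vpx_convolve8_horiz_c. The 2-tap path uses
 * only the center coefficient pair (&filt_hor[3]).
 */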
void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  const int16_t *const filter_x = filter[x0_q4];
  int8_t cnt, filt_hor[8];

  assert(x_step_q4 == 16);
  assert(((const int32_t *)filter_x)[1] != 0x800000);

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
  }

  if (vpx_get_filter_taps(filter_x) == 2) {
    switch (w) {
      case 4:
        common_hz_2t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 8:
        common_hz_2t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            &filt_hor[3], h);
        break;
      case 16:
        common_hz_2t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 32:
        common_hz_2t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      case 64:
        common_hz_2t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             &filt_hor[3], h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  } else {
    switch (w) {
      case 4:
        common_hz_8t_4w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 8:
        common_hz_8t_8w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                            filt_hor, h);
        break;
      case 16:
        common_hz_8t_16w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 32:
        common_hz_8t_32w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      case 64:
        common_hz_8t_64w_msa(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
                             filt_hor, h);
        break;
      default:
        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                              x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}