/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

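/* 2-tap bilinear filter taps for the eight sub-pixel positions.  Each pair
 * of taps sums to 128, i.e. 1 << FILTER_BITS, which is why the filtered
 * sums below are normalized with rounding right shifts by FILTER_BITS. */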
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
};

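/* Accumulate error statistics for one pair of 16-byte vectors: interleave
 * the src and ref bytes, form the per-pixel differences (src - ref) as
 * signed halfwords, add their squares into the SSE accumulator 'var', and
 * add the differences themselves into the running sum 'sub'. */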
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    (sub) += res_l0_m + res_l1_m;                                   \
  }

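/* variance = sse - (sum * sum) / (width * height), where 'shift' is
 * log2(width * height).  The _LARGE_ variant widens the squared sum to
 * 64 bits so it cannot overflow for the bigger block sizes.
 *
 * Purely illustrative scalar sketch of what the MSA kernels in this file
 * compute for an 8x8 block ('src', 'ref' and the strides are placeholder
 * names, not part of this file):
 *
 *   uint32_t sse = 0;
 *   int32_t sum = 0, i, j;
 *   for (i = 0; i < 8; ++i) {
 *     for (j = 0; j < 8; ++j) {
 *       const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
 *       sum += d;
 *       sse += d * d;
 *     }
 *   }
 *   variance = VARIANCE_WxH(sse, sum, 6);  // 6 == log2(8 * 8)
 */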
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

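/* The avg_sse_diff_* helpers compute the error statistics against the
 * compound prediction: each block of source pixels is first averaged with
 * the corresponding rows of 'sec_pred', then the SSE is returned and the
 * sum of differences versus 'ref' is written to '*diff'. */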
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
45   int32_t ht_cnt;
46   uint32_t src0, src1, src2, src3;
47   uint32_t ref0, ref1, ref2, ref3;
48   v16u8 pred, src = { 0 };
49   v16u8 ref = { 0 };
50   v8i16 avg = { 0 };
51   v4i32 vec, var = { 0 };
52 
53   for (ht_cnt = (height >> 2); ht_cnt--;) {
54     pred = LD_UB(sec_pred);
55     sec_pred += 16;
56     LW4(src_ptr, src_stride, src0, src1, src2, src3);
57     src_ptr += (4 * src_stride);
58     LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
59     ref_ptr += (4 * ref_stride);
60 
61     INSERT_W4_UB(src0, src1, src2, src3, src);
62     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
63 
64     src = __msa_aver_u_b(src, pred);
65     CALC_MSE_AVG_B(src, ref, var, avg);
66   }
67 
68   vec = __msa_hadd_s_w(avg, avg);
69   *diff = HADD_SW_S32(vec);
70 
71   return HADD_SW_S32(var);
72 }
73 
static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
80   int32_t ht_cnt;
81   v16u8 src0, src1, src2, src3;
82   v16u8 ref0, ref1, ref2, ref3;
83   v16u8 pred0, pred1;
84   v8i16 avg = { 0 };
85   v4i32 vec, var = { 0 };
86 
87   for (ht_cnt = (height >> 2); ht_cnt--;) {
88     LD_UB2(sec_pred, 16, pred0, pred1);
89     sec_pred += 32;
90     LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
91     src_ptr += (4 * src_stride);
92     LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
93     ref_ptr += (4 * ref_stride);
94 
95     PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
96                 ref0, ref1);
97     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
98     CALC_MSE_AVG_B(src0, ref0, var, avg);
99     CALC_MSE_AVG_B(src1, ref1, var, avg);
100   }
101 
102   vec = __msa_hadd_s_w(avg, avg);
103   *diff = HADD_SW_S32(vec);
104 
105   return HADD_SW_S32(var);
106 }
107 
static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
114   int32_t ht_cnt;
115   v16u8 src, ref, pred;
116   v8i16 avg = { 0 };
117   v4i32 vec, var = { 0 };
118 
119   for (ht_cnt = (height >> 2); ht_cnt--;) {
120     pred = LD_UB(sec_pred);
121     sec_pred += 16;
122     src = LD_UB(src_ptr);
123     src_ptr += src_stride;
124     ref = LD_UB(ref_ptr);
125     ref_ptr += ref_stride;
126     src = __msa_aver_u_b(src, pred);
127     CALC_MSE_AVG_B(src, ref, var, avg);
128 
129     pred = LD_UB(sec_pred);
130     sec_pred += 16;
131     src = LD_UB(src_ptr);
132     src_ptr += src_stride;
133     ref = LD_UB(ref_ptr);
134     ref_ptr += ref_stride;
135     src = __msa_aver_u_b(src, pred);
136     CALC_MSE_AVG_B(src, ref, var, avg);
137 
138     pred = LD_UB(sec_pred);
139     sec_pred += 16;
140     src = LD_UB(src_ptr);
141     src_ptr += src_stride;
142     ref = LD_UB(ref_ptr);
143     ref_ptr += ref_stride;
144     src = __msa_aver_u_b(src, pred);
145     CALC_MSE_AVG_B(src, ref, var, avg);
146 
147     pred = LD_UB(sec_pred);
148     sec_pred += 16;
149     src = LD_UB(src_ptr);
150     src_ptr += src_stride;
151     ref = LD_UB(ref_ptr);
152     ref_ptr += ref_stride;
153     src = __msa_aver_u_b(src, pred);
154     CALC_MSE_AVG_B(src, ref, var, avg);
155   }
156 
157   vec = __msa_hadd_s_w(avg, avg);
158   *diff = HADD_SW_S32(vec);
159 
160   return HADD_SW_S32(var);
161 }
162 
static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
169   int32_t ht_cnt;
170   v16u8 src0, src1, ref0, ref1, pred0, pred1;
171   v8i16 avg = { 0 };
172   v4i32 vec, var = { 0 };
173 
174   for (ht_cnt = (height >> 2); ht_cnt--;) {
175     LD_UB2(sec_pred, 16, pred0, pred1);
176     sec_pred += 32;
177     LD_UB2(src_ptr, 16, src0, src1);
178     src_ptr += src_stride;
179     LD_UB2(ref_ptr, 16, ref0, ref1);
180     ref_ptr += ref_stride;
181     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
182     CALC_MSE_AVG_B(src0, ref0, var, avg);
183     CALC_MSE_AVG_B(src1, ref1, var, avg);
184 
185     LD_UB2(sec_pred, 16, pred0, pred1);
186     sec_pred += 32;
187     LD_UB2(src_ptr, 16, src0, src1);
188     src_ptr += src_stride;
189     LD_UB2(ref_ptr, 16, ref0, ref1);
190     ref_ptr += ref_stride;
191     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
192     CALC_MSE_AVG_B(src0, ref0, var, avg);
193     CALC_MSE_AVG_B(src1, ref1, var, avg);
194 
195     LD_UB2(sec_pred, 16, pred0, pred1);
196     sec_pred += 32;
197     LD_UB2(src_ptr, 16, src0, src1);
198     src_ptr += src_stride;
199     LD_UB2(ref_ptr, 16, ref0, ref1);
200     ref_ptr += ref_stride;
201     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
202     CALC_MSE_AVG_B(src0, ref0, var, avg);
203     CALC_MSE_AVG_B(src1, ref1, var, avg);
204 
205     LD_UB2(sec_pred, 16, pred0, pred1);
206     sec_pred += 32;
207     LD_UB2(src_ptr, 16, src0, src1);
208     src_ptr += src_stride;
209     LD_UB2(ref_ptr, 16, ref0, ref1);
210     ref_ptr += ref_stride;
211     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
212     CALC_MSE_AVG_B(src0, ref0, var, avg);
213     CALC_MSE_AVG_B(src1, ref1, var, avg);
214   }
215 
216   vec = __msa_hadd_s_w(avg, avg);
217   *diff = HADD_SW_S32(vec);
218 
219   return HADD_SW_S32(var);
220 }
221 
static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
227   int32_t ht_cnt;
228   v16u8 src0, src1, ref0, ref1, pred0, pred1;
229   v8i16 avg0 = { 0 };
230   v8i16 avg1 = { 0 };
231   v4i32 vec, var = { 0 };
232 
233   for (ht_cnt = 16; ht_cnt--;) {
234     LD_UB2(sec_pred, 16, pred0, pred1);
235     sec_pred += 32;
236     LD_UB2(src_ptr, 16, src0, src1);
237     src_ptr += src_stride;
238     LD_UB2(ref_ptr, 16, ref0, ref1);
239     ref_ptr += ref_stride;
240     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
241     CALC_MSE_AVG_B(src0, ref0, var, avg0);
242     CALC_MSE_AVG_B(src1, ref1, var, avg1);
243 
244     LD_UB2(sec_pred, 16, pred0, pred1);
245     sec_pred += 32;
246     LD_UB2(src_ptr, 16, src0, src1);
247     src_ptr += src_stride;
248     LD_UB2(ref_ptr, 16, ref0, ref1);
249     ref_ptr += ref_stride;
250     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
251     CALC_MSE_AVG_B(src0, ref0, var, avg0);
252     CALC_MSE_AVG_B(src1, ref1, var, avg1);
253 
254     LD_UB2(sec_pred, 16, pred0, pred1);
255     sec_pred += 32;
256     LD_UB2(src_ptr, 16, src0, src1);
257     src_ptr += src_stride;
258     LD_UB2(ref_ptr, 16, ref0, ref1);
259     ref_ptr += ref_stride;
260     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
261     CALC_MSE_AVG_B(src0, ref0, var, avg0);
262     CALC_MSE_AVG_B(src1, ref1, var, avg1);
263 
264     LD_UB2(sec_pred, 16, pred0, pred1);
265     sec_pred += 32;
266     LD_UB2(src_ptr, 16, src0, src1);
267     src_ptr += src_stride;
268     LD_UB2(ref_ptr, 16, ref0, ref1);
269     ref_ptr += ref_stride;
270     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
271     CALC_MSE_AVG_B(src0, ref0, var, avg0);
272     CALC_MSE_AVG_B(src1, ref1, var, avg1);
273   }
274 
275   vec = __msa_hadd_s_w(avg0, avg0);
276   vec += __msa_hadd_s_w(avg1, avg1);
277   *diff = HADD_SW_S32(vec);
278 
279   return HADD_SW_S32(var);
280 }
281 
static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
287   int32_t ht_cnt;
288   v16u8 src0, src1, src2, src3;
289   v16u8 ref0, ref1, ref2, ref3;
290   v16u8 pred0, pred1, pred2, pred3;
291   v8i16 avg0 = { 0 };
292   v8i16 avg1 = { 0 };
293   v4i32 vec, var = { 0 };
294 
295   for (ht_cnt = 16; ht_cnt--;) {
296     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
297     sec_pred += 64;
298     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
299     src_ptr += src_stride;
300     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
301     ref_ptr += ref_stride;
302     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
303                 src2, src3);
304     CALC_MSE_AVG_B(src0, ref0, var, avg0);
305     CALC_MSE_AVG_B(src2, ref2, var, avg0);
306     CALC_MSE_AVG_B(src1, ref1, var, avg1);
307     CALC_MSE_AVG_B(src3, ref3, var, avg1);
308 
309     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
310     sec_pred += 64;
311     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
312     src_ptr += src_stride;
313     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
314     ref_ptr += ref_stride;
315     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
316                 src2, src3);
317     CALC_MSE_AVG_B(src0, ref0, var, avg0);
318     CALC_MSE_AVG_B(src2, ref2, var, avg0);
319     CALC_MSE_AVG_B(src1, ref1, var, avg1);
320     CALC_MSE_AVG_B(src3, ref3, var, avg1);
321   }
322 
323   vec = __msa_hadd_s_w(avg0, avg0);
324   vec += __msa_hadd_s_w(avg1, avg1);
325 
326   *diff = HADD_SW_S32(vec);
327 
328   return HADD_SW_S32(var);
329 }
330 
static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
336   int32_t ht_cnt;
337   v16u8 src0, src1, src2, src3;
338   v16u8 ref0, ref1, ref2, ref3;
339   v16u8 pred0, pred1, pred2, pred3;
340   v8i16 avg0 = { 0 };
341   v8i16 avg1 = { 0 };
342   v8i16 avg2 = { 0 };
343   v8i16 avg3 = { 0 };
344   v4i32 vec, var = { 0 };
345 
346   for (ht_cnt = 32; ht_cnt--;) {
347     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
348     sec_pred += 64;
349     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
350     src_ptr += src_stride;
351     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
352     ref_ptr += ref_stride;
353     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
354                 src2, src3);
355     CALC_MSE_AVG_B(src0, ref0, var, avg0);
356     CALC_MSE_AVG_B(src1, ref1, var, avg1);
357     CALC_MSE_AVG_B(src2, ref2, var, avg2);
358     CALC_MSE_AVG_B(src3, ref3, var, avg3);
359 
360     LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
361     sec_pred += 64;
362     LD_UB4(src_ptr, 16, src0, src1, src2, src3);
363     src_ptr += src_stride;
364     LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
365     ref_ptr += ref_stride;
366     AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
367                 src2, src3);
368     CALC_MSE_AVG_B(src0, ref0, var, avg0);
369     CALC_MSE_AVG_B(src1, ref1, var, avg1);
370     CALC_MSE_AVG_B(src2, ref2, var, avg2);
371     CALC_MSE_AVG_B(src3, ref3, var, avg3);
372   }
373 
374   vec = __msa_hadd_s_w(avg0, avg0);
375   vec += __msa_hadd_s_w(avg1, avg1);
376   vec += __msa_hadd_s_w(avg2, avg2);
377   vec += __msa_hadd_s_w(avg3, avg3);
378   *diff = HADD_SW_S32(vec);
379 
380   return HADD_SW_S32(var);
381 }
382 
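/* The sub_pixel_sse_diff_*_{h,v,hv} helpers interpolate the source with the
 * 2-tap bilinear filter (horizontally, vertically, or both) before
 * accumulating the SSE and the sum of differences against 'dst'.  The two
 * filter taps are broadcast from the first halfword of 'filter'. */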
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
386   int16_t filtval;
387   uint32_t loop_cnt;
388   uint32_t ref0, ref1, ref2, ref3;
389   v16u8 filt0, ref = { 0 };
390   v16i8 src0, src1, src2, src3;
391   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
392   v8u16 vec0, vec1, vec2, vec3;
393   v8i16 avg = { 0 };
394   v4i32 vec, var = { 0 };
395 
396   filtval = LH(filter);
397   filt0 = (v16u8)__msa_fill_h(filtval);
398 
399   for (loop_cnt = (height >> 2); loop_cnt--;) {
400     LD_SB4(src, src_stride, src0, src1, src2, src3);
401     src += (4 * src_stride);
402     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
403     dst += (4 * dst_stride);
404     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
405     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
406     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
407     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
408                 vec2, vec3);
409     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
410     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
411                 src2, src3);
412     ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
413     src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
414     CALC_MSE_AVG_B(src0, ref, var, avg);
415   }
416 
417   vec = __msa_hadd_s_w(avg, avg);
418   *diff = HADD_SW_S32(vec);
419 
420   return HADD_SW_S32(var);
421 }
422 
static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
426   int16_t filtval;
427   uint32_t loop_cnt;
428   v16u8 filt0, out, ref0, ref1, ref2, ref3;
429   v16i8 src0, src1, src2, src3;
430   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
431   v8u16 vec0, vec1, vec2, vec3;
432   v8i16 avg = { 0 };
433   v4i32 vec, var = { 0 };
434 
435   filtval = LH(filter);
436   filt0 = (v16u8)__msa_fill_h(filtval);
437 
438   for (loop_cnt = (height >> 2); loop_cnt--;) {
439     LD_SB4(src, src_stride, src0, src1, src2, src3);
440     src += (4 * src_stride);
441     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
442     dst += (4 * dst_stride);
443 
444     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
445     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
446     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
447     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
448                 vec2, vec3);
449     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
450     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
451                 src2, src3);
452     out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
453     CALC_MSE_AVG_B(out, ref0, var, avg);
454     out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
455     CALC_MSE_AVG_B(out, ref1, var, avg);
456   }
457 
458   vec = __msa_hadd_s_w(avg, avg);
459   *diff = HADD_SW_S32(vec);
460 
461   return HADD_SW_S32(var);
462 }
463 
static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
467   int16_t filtval;
468   uint32_t loop_cnt;
469   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
470   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
471   v16u8 dst0, dst1, dst2, dst3, filt0;
472   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
473   v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
474   v8i16 avg = { 0 };
475   v4i32 vec, var = { 0 };
476 
477   filtval = LH(filter);
478   filt0 = (v16u8)__msa_fill_h(filtval);
479 
480   for (loop_cnt = (height >> 2); loop_cnt--;) {
481     LD_SB4(src, src_stride, src0, src2, src4, src6);
482     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
483     src += (4 * src_stride);
484     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
485     dst += (4 * dst_stride);
486 
487     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
488     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
489     VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
490     VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
491     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
492                 out2, out3);
493     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
494                 out6, out7);
495     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
496     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
497     PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
498                 src2, src3);
499     CALC_MSE_AVG_B(src0, dst0, var, avg);
500     CALC_MSE_AVG_B(src1, dst1, var, avg);
501     CALC_MSE_AVG_B(src2, dst2, var, avg);
502     CALC_MSE_AVG_B(src3, dst3, var, avg);
503   }
504 
505   vec = __msa_hadd_s_w(avg, avg);
506   *diff = HADD_SW_S32(vec);
507 
508   return HADD_SW_S32(var);
509 }
510 
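/* Wider blocks are processed as columns of 16 pixels: the 32- and 64-wide
 * wrappers run the 16-wide kernel on successive 16-column slices and add up
 * the partial SSE and diff results. */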
static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
514   uint32_t loop_cnt, sse = 0;
515   int32_t diff0[2];
516 
517   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
518     sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
519                                             filter, height, &diff0[loop_cnt]);
520     src += 16;
521     dst += 16;
522   }
523 
524   *diff = diff0[0] + diff0[1];
525 
526   return sse;
527 }
528 
static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
532   uint32_t loop_cnt, sse = 0;
533   int32_t diff0[4];
534 
535   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
536     sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
537                                             filter, height, &diff0[loop_cnt]);
538     src += 16;
539     dst += 16;
540   }
541 
542   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
543 
544   return sse;
545 }
546 
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
550   int16_t filtval;
551   uint32_t loop_cnt;
552   uint32_t ref0, ref1, ref2, ref3;
553   v16u8 src0, src1, src2, src3, src4, out;
554   v16u8 src10_r, src32_r, src21_r, src43_r;
555   v16u8 ref = { 0 };
556   v16u8 src2110, src4332;
557   v16u8 filt0;
558   v8i16 avg = { 0 };
559   v4i32 vec, var = { 0 };
560   v8u16 tmp0, tmp1;
561 
562   filtval = LH(filter);
563   filt0 = (v16u8)__msa_fill_h(filtval);
564 
565   src0 = LD_UB(src);
566   src += src_stride;
567 
568   for (loop_cnt = (height >> 2); loop_cnt--;) {
569     LD_UB4(src, src_stride, src1, src2, src3, src4);
570     src += (4 * src_stride);
571     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
572     dst += (4 * dst_stride);
573 
574     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
575     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
576                src32_r, src43_r);
577     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
578     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
579     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
580     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
581     CALC_MSE_AVG_B(out, ref, var, avg);
582     src0 = src4;
583   }
584 
585   vec = __msa_hadd_s_w(avg, avg);
586   *diff = HADD_SW_S32(vec);
587 
588   return HADD_SW_S32(var);
589 }
590 
static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
594   int16_t filtval;
595   uint32_t loop_cnt;
596   v16u8 src0, src1, src2, src3, src4;
597   v16u8 ref0, ref1, ref2, ref3;
598   v8u16 vec0, vec1, vec2, vec3;
599   v8u16 tmp0, tmp1, tmp2, tmp3;
600   v16u8 filt0;
601   v8i16 avg = { 0 };
602   v4i32 vec, var = { 0 };
603 
604   filtval = LH(filter);
605   filt0 = (v16u8)__msa_fill_h(filtval);
606 
607   src0 = LD_UB(src);
608   src += src_stride;
609 
610   for (loop_cnt = (height >> 2); loop_cnt--;) {
611     LD_UB4(src, src_stride, src1, src2, src3, src4);
612     src += (4 * src_stride);
613     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
614     dst += (4 * dst_stride);
615 
616     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
617     ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
618                vec3);
619     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
620                 tmp2, tmp3);
621     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
622     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
623     CALC_MSE_AVG_B(src0, ref0, var, avg);
624     CALC_MSE_AVG_B(src1, ref1, var, avg);
625     src0 = src4;
626   }
627 
628   vec = __msa_hadd_s_w(avg, avg);
629   *diff = HADD_SW_S32(vec);
630 
631   return HADD_SW_S32(var);
632 }
633 
static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
637   int16_t filtval;
638   uint32_t loop_cnt;
639   v16u8 ref0, ref1, ref2, ref3;
640   v16u8 src0, src1, src2, src3, src4;
641   v16u8 out0, out1, out2, out3;
642   v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
643   v8u16 tmp0, tmp1, tmp2, tmp3;
644   v16u8 filt0;
645   v8i16 avg = { 0 };
646   v4i32 vec, var = { 0 };
647 
648   filtval = LH(filter);
649   filt0 = (v16u8)__msa_fill_h(filtval);
650 
651   src0 = LD_UB(src);
652   src += src_stride;
653 
654   for (loop_cnt = (height >> 2); loop_cnt--;) {
655     LD_UB4(src, src_stride, src1, src2, src3, src4);
656     src += (4 * src_stride);
657     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
658     dst += (4 * dst_stride);
659 
660     ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
661     ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
662     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
663     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
664     out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
665 
666     ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
667     ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
668     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
669     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
670     out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
671 
672     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
673     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
674     out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
675     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
676     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
677     out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
678 
679     src0 = src4;
680 
681     CALC_MSE_AVG_B(out0, ref0, var, avg);
682     CALC_MSE_AVG_B(out1, ref1, var, avg);
683     CALC_MSE_AVG_B(out2, ref2, var, avg);
684     CALC_MSE_AVG_B(out3, ref3, var, avg);
685   }
686 
687   vec = __msa_hadd_s_w(avg, avg);
688   *diff = HADD_SW_S32(vec);
689 
690   return HADD_SW_S32(var);
691 }
692 
static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
696   uint32_t loop_cnt, sse = 0;
697   int32_t diff0[2];
698 
699   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
700     sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
701                                             filter, height, &diff0[loop_cnt]);
702     src += 16;
703     dst += 16;
704   }
705 
706   *diff = diff0[0] + diff0[1];
707 
708   return sse;
709 }
710 
static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
714   uint32_t loop_cnt, sse = 0;
715   int32_t diff0[4];
716 
717   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
718     sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
719                                             filter, height, &diff0[loop_cnt]);
720     src += 16;
721     dst += 16;
722   }
723 
724   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
725 
726   return sse;
727 }
728 
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
733   int16_t filtval;
734   uint32_t loop_cnt;
735   uint32_t ref0, ref1, ref2, ref3;
736   v16u8 src0, src1, src2, src3, src4;
737   v16u8 out, ref = { 0 };
738   v16u8 filt_vt, filt_hz, vec0, vec1;
739   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
740   v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
741   v8u16 tmp0, tmp1;
742   v8i16 avg = { 0 };
743   v4i32 vec, var = { 0 };
744 
745   filtval = LH(filter_horiz);
746   filt_hz = (v16u8)__msa_fill_h(filtval);
747   filtval = LH(filter_vert);
748   filt_vt = (v16u8)__msa_fill_h(filtval);
749 
750   src0 = LD_UB(src);
751   src += src_stride;
752 
753   for (loop_cnt = (height >> 2); loop_cnt--;) {
754     LD_UB4(src, src_stride, src1, src2, src3, src4);
755     src += (4 * src_stride);
756     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
757     dst += (4 * dst_stride);
758     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
759     hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
760     hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
761     hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
762     hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
763     hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
764     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
765     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
766     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
767     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
768     CALC_MSE_AVG_B(out, ref, var, avg);
769     src0 = src4;
770   }
771 
772   vec = __msa_hadd_s_w(avg, avg);
773   *diff = HADD_SW_S32(vec);
774 
775   return HADD_SW_S32(var);
776 }
777 
static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
782   int16_t filtval;
783   uint32_t loop_cnt;
784   v16u8 ref0, ref1, ref2, ref3;
785   v16u8 src0, src1, src2, src3, src4;
786   v16u8 out0, out1;
787   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
788   v8u16 hz_out0, hz_out1;
789   v8u16 tmp0, tmp1, tmp2, tmp3;
790   v16u8 filt_vt, filt_hz, vec0;
791   v8i16 avg = { 0 };
792   v4i32 vec, var = { 0 };
793 
794   filtval = LH(filter_horiz);
795   filt_hz = (v16u8)__msa_fill_h(filtval);
796   filtval = LH(filter_vert);
797   filt_vt = (v16u8)__msa_fill_h(filtval);
798 
799   src0 = LD_UB(src);
800   src += src_stride;
801   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
802 
803   for (loop_cnt = (height >> 2); loop_cnt--;) {
804     LD_UB4(src, src_stride, src1, src2, src3, src4);
805     src += (4 * src_stride);
806     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
807     dst += (4 * dst_stride);
808 
809     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
810     hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
811     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
812     tmp0 = __msa_dotp_u_h(vec0, filt_vt);
813     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
814     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
815     tmp1 = __msa_dotp_u_h(vec0, filt_vt);
816     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
817     hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
818     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
819     tmp2 = __msa_dotp_u_h(vec0, filt_vt);
820     hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
821     vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
822     tmp3 = __msa_dotp_u_h(vec0, filt_vt);
823     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
824     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
825     CALC_MSE_AVG_B(out0, ref0, var, avg);
826     CALC_MSE_AVG_B(out1, ref1, var, avg);
827   }
828 
829   vec = __msa_hadd_s_w(avg, avg);
830   *diff = HADD_SW_S32(vec);
831 
832   return HADD_SW_S32(var);
833 }
834 
static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
839   int16_t filtval;
840   uint32_t loop_cnt;
841   v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
842   v16u8 ref0, ref1, ref2, ref3;
843   v16u8 filt_hz, filt_vt, vec0, vec1;
844   v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
845   v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
846   v8u16 tmp0, tmp1;
847   v8i16 avg = { 0 };
848   v4i32 vec, var = { 0 };
849 
850   filtval = LH(filter_horiz);
851   filt_hz = (v16u8)__msa_fill_h(filtval);
852   filtval = LH(filter_vert);
853   filt_vt = (v16u8)__msa_fill_h(filtval);
854 
855   LD_UB2(src, 8, src0, src1);
856   src += src_stride;
857 
858   hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
859   hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
860 
861   for (loop_cnt = (height >> 2); loop_cnt--;) {
862     LD_UB4(src, src_stride, src0, src2, src4, src6);
863     LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
864     src += (4 * src_stride);
865     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
866     dst += (4 * dst_stride);
867 
868     hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
869     hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
870     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
871     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
872     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
873     src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
874 
875     hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
876     hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
877     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
878     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
879     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
880     src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
881 
882     hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
883     hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
884     ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
885     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
886     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
887     src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
888 
889     hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
890     hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
891     ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
892     DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
893     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
894     src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
895 
896     CALC_MSE_AVG_B(src0, ref0, var, avg);
897     CALC_MSE_AVG_B(src1, ref1, var, avg);
898     CALC_MSE_AVG_B(src2, ref2, var, avg);
899     CALC_MSE_AVG_B(src3, ref3, var, avg);
900   }
901 
902   vec = __msa_hadd_s_w(avg, avg);
903   *diff = HADD_SW_S32(vec);
904 
905   return HADD_SW_S32(var);
906 }
907 
static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
912   uint32_t loop_cnt, sse = 0;
913   int32_t diff0[2];
914 
915   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
916     sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
917                                              filter_horiz, filter_vert, height,
918                                              &diff0[loop_cnt]);
919     src += 16;
920     dst += 16;
921   }
922 
923   *diff = diff0[0] + diff0[1];
924 
925   return sse;
926 }
927 
static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
932   uint32_t loop_cnt, sse = 0;
933   int32_t diff0[4];
934 
935   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
936     sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
937                                              filter_horiz, filter_vert, height,
938                                              &diff0[loop_cnt]);
939     src += 16;
940     dst += 16;
941   }
942 
943   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
944 
945   return sse;
946 }
947 
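/* The sub_pixel_avg_* helpers combine both steps above: the source is first
 * bilinearly interpolated and the result is then averaged with 'sec_pred'
 * before the SSE and the sum of differences are accumulated. */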
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
952   int16_t filtval;
953   uint32_t loop_cnt;
954   uint32_t ref0, ref1, ref2, ref3;
955   v16u8 out, pred, filt0, ref = { 0 };
956   v16i8 src0, src1, src2, src3;
957   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
958   v8u16 vec0, vec1, vec2, vec3;
959   v8i16 avg = { 0 };
960   v4i32 vec, var = { 0 };
961 
962   filtval = LH(filter);
963   filt0 = (v16u8)__msa_fill_h(filtval);
964 
965   for (loop_cnt = (height >> 2); loop_cnt--;) {
966     LD_SB4(src, src_stride, src0, src1, src2, src3);
967     src += (4 * src_stride);
968     pred = LD_UB(sec_pred);
969     sec_pred += 16;
970     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
971     dst += (4 * dst_stride);
972 
973     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
974     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
975     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
976     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
977                 vec2, vec3);
978     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
979     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
980                 src2, src3);
981     ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
982     out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
983     out = __msa_aver_u_b(out, pred);
984     CALC_MSE_AVG_B(out, ref, var, avg);
985   }
986 
987   vec = __msa_hadd_s_w(avg, avg);
988   *diff = HADD_SW_S32(vec);
989 
990   return HADD_SW_S32(var);
991 }
992 
static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
997   int16_t filtval;
998   uint32_t loop_cnt;
999   v16u8 out, pred, filt0;
1000   v16u8 ref0, ref1, ref2, ref3;
1001   v16i8 src0, src1, src2, src3;
1002   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1003   v8u16 vec0, vec1, vec2, vec3;
1004   v8i16 avg = { 0 };
1005   v4i32 vec, var = { 0 };
1006 
1007   filtval = LH(filter);
1008   filt0 = (v16u8)__msa_fill_h(filtval);
1009 
1010   for (loop_cnt = (height >> 2); loop_cnt--;) {
1011     LD_SB4(src, src_stride, src0, src1, src2, src3);
1012     src += (4 * src_stride);
1013     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1014     dst += (4 * dst_stride);
1015 
1016     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1017     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1018     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1019     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
1020                 vec2, vec3);
1021     SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
1022     PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
1023                 src2, src3);
1024     out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
1025 
1026     pred = LD_UB(sec_pred);
1027     sec_pred += 16;
1028     out = __msa_aver_u_b(out, pred);
1029     CALC_MSE_AVG_B(out, ref0, var, avg);
1030     out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
1031     pred = LD_UB(sec_pred);
1032     sec_pred += 16;
1033     out = __msa_aver_u_b(out, pred);
1034     CALC_MSE_AVG_B(out, ref1, var, avg);
1035   }
1036 
1037   vec = __msa_hadd_s_w(avg, avg);
1038   *diff = HADD_SW_S32(vec);
1039 
1040   return HADD_SW_S32(var);
1041 }
1042 
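/* Shared 16-wide kernel for the averaging horizontal case.  The extra
 * 'width' argument is the row stride of the 'sec_pred' buffer, which lets
 * the 16-, 32- and 64-wide wrappers reuse this code while stepping through
 * sec_pred one 16-byte column at a time. */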
static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
1047   int16_t filtval;
1048   uint32_t loop_cnt;
1049   v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1050   v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1051   v16u8 dst0, dst1, dst2, dst3;
1052   v16u8 tmp0, tmp1, tmp2, tmp3;
1053   v16u8 pred0, pred1, pred2, pred3, filt0;
1054   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1055   v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
1056   v8i16 avg = { 0 };
1057   v4i32 vec, var = { 0 };
1058 
1059   filtval = LH(filter);
1060   filt0 = (v16u8)__msa_fill_h(filtval);
1061 
1062   for (loop_cnt = (height >> 2); loop_cnt--;) {
1063     LD_SB4(src, src_stride, src0, src2, src4, src6);
1064     LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1065     src += (4 * src_stride);
1066     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1067     dst += (4 * dst_stride);
1068     LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1069     sec_pred += (4 * width);
1070 
1071     VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1072     VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1073     VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
1074     VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
1075     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
1076                 out2, out3);
1077     DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
1078                 out6, out7);
1079     SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
1080     SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
1081     PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
1082                 tmp2, tmp3);
1083     AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
1084                 tmp2, tmp3);
1085 
1086     CALC_MSE_AVG_B(tmp0, dst0, var, avg);
1087     CALC_MSE_AVG_B(tmp1, dst1, var, avg);
1088     CALC_MSE_AVG_B(tmp2, dst2, var, avg);
1089     CALC_MSE_AVG_B(tmp3, dst3, var, avg);
1090   }
1091 
1092   vec = __msa_hadd_s_w(avg, avg);
1093   *diff = HADD_SW_S32(vec);
1094 
1095   return HADD_SW_S32(var);
1096 }
1097 
static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1102   return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
1103                                       sec_pred, filter, height, diff, 16);
1104 }
1105 
static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1110   uint32_t loop_cnt, sse = 0;
1111   int32_t diff0[2];
1112 
1113   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1114     sse +=
1115         subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
1116                                      filter, height, &diff0[loop_cnt], 32);
1117     src += 16;
1118     dst += 16;
1119     sec_pred += 16;
1120   }
1121 
1122   *diff = diff0[0] + diff0[1];
1123 
1124   return sse;
1125 }
1126 
static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1131   uint32_t loop_cnt, sse = 0;
1132   int32_t diff0[4];
1133 
1134   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1135     sse +=
1136         subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
1137                                      filter, height, &diff0[loop_cnt], 64);
1138     src += 16;
1139     dst += 16;
1140     sec_pred += 16;
1141   }
1142 
1143   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1144 
1145   return sse;
1146 }
1147 
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1152   int16_t filtval;
1153   uint32_t loop_cnt;
1154   uint32_t ref0, ref1, ref2, ref3;
1155   v16u8 src0, src1, src2, src3, src4;
1156   v16u8 src10_r, src32_r, src21_r, src43_r;
1157   v16u8 out, pred, ref = { 0 };
1158   v16u8 src2110, src4332, filt0;
1159   v8i16 avg = { 0 };
1160   v4i32 vec, var = { 0 };
1161   v8u16 tmp0, tmp1;
1162 
1163   filtval = LH(filter);
1164   filt0 = (v16u8)__msa_fill_h(filtval);
1165 
1166   src0 = LD_UB(src);
1167   src += src_stride;
1168 
1169   for (loop_cnt = (height >> 2); loop_cnt--;) {
1170     LD_UB4(src, src_stride, src1, src2, src3, src4);
1171     src += (4 * src_stride);
1172     pred = LD_UB(sec_pred);
1173     sec_pred += 16;
1174     LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1175     dst += (4 * dst_stride);
1176 
1177     INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1178     ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1179                src32_r, src43_r);
1180     ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1181     DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1182     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1183 
1184     out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1185     out = __msa_aver_u_b(out, pred);
1186     CALC_MSE_AVG_B(out, ref, var, avg);
1187     src0 = src4;
1188   }
1189 
1190   vec = __msa_hadd_s_w(avg, avg);
1191   *diff = HADD_SW_S32(vec);
1192 
1193   return HADD_SW_S32(var);
1194 }
1195 
static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1200   int16_t filtval;
1201   uint32_t loop_cnt;
1202   v16u8 src0, src1, src2, src3, src4;
1203   v16u8 ref0, ref1, ref2, ref3;
1204   v16u8 pred0, pred1, filt0;
1205   v8u16 vec0, vec1, vec2, vec3;
1206   v8u16 tmp0, tmp1, tmp2, tmp3;
1207   v8i16 avg = { 0 };
1208   v4i32 vec, var = { 0 };
1209 
1210   filtval = LH(filter);
1211   filt0 = (v16u8)__msa_fill_h(filtval);
1212 
1213   src0 = LD_UB(src);
1214   src += src_stride;
1215 
1216   for (loop_cnt = (height >> 2); loop_cnt--;) {
1217     LD_UB4(src, src_stride, src1, src2, src3, src4);
1218     src += (4 * src_stride);
1219     LD_UB2(sec_pred, 16, pred0, pred1);
1220     sec_pred += 32;
1221     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1222     dst += (4 * dst_stride);
1223     PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
1224     ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
1225                vec3);
1226     DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
1227                 tmp2, tmp3);
1228     SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
1229     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1230     AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
1231     CALC_MSE_AVG_B(src0, ref0, var, avg);
1232     CALC_MSE_AVG_B(src1, ref1, var, avg);
1233 
1234     src0 = src4;
1235   }
1236 
1237   vec = __msa_hadd_s_w(avg, avg);
1238   *diff = HADD_SW_S32(vec);
1239 
1240   return HADD_SW_S32(var);
1241 }
1242 
static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
1247   int16_t filtval;
1248   uint32_t loop_cnt;
1249   v16u8 ref0, ref1, ref2, ref3;
1250   v16u8 pred0, pred1, pred2, pred3;
1251   v16u8 src0, src1, src2, src3, src4;
1252   v16u8 out0, out1, out2, out3, filt0;
1253   v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1254   v8u16 tmp0, tmp1, tmp2, tmp3;
1255   v8i16 avg = { 0 };
1256   v4i32 vec, var = { 0 };
1257 
1258   filtval = LH(filter);
1259   filt0 = (v16u8)__msa_fill_h(filtval);
1260 
1261   src0 = LD_UB(src);
1262   src += src_stride;
1263 
1264   for (loop_cnt = (height >> 2); loop_cnt--;) {
1265     LD_UB4(src, src_stride, src1, src2, src3, src4);
1266     src += (4 * src_stride);
1267     LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
1268     sec_pred += (4 * width);
1269 
1270     ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
1271     ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
1272     DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1273     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1274     out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1275 
1276     ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
1277     ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
1278     DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1279     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1280     out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1281 
1282     DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1283     SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1284     out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1285 
1286     DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1287     SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
1288     out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
1289 
1290     src0 = src4;
1291     LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
1292     dst += (4 * dst_stride);
1293 
1294     AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
1295                 out2, out3);
1296 
1297     CALC_MSE_AVG_B(out0, ref0, var, avg);
1298     CALC_MSE_AVG_B(out1, ref1, var, avg);
1299     CALC_MSE_AVG_B(out2, ref2, var, avg);
1300     CALC_MSE_AVG_B(out3, ref3, var, avg);
1301   }
1302 
1303   vec = __msa_hadd_s_w(avg, avg);
1304   *diff = HADD_SW_S32(vec);
1305 
1306   return HADD_SW_S32(var);
1307 }
1308 
static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1313   return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
1314                                       sec_pred, filter, height, diff, 16);
1315 }
1316 
static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
1321   uint32_t loop_cnt, sse = 0;
1322   int32_t diff0[2];
1323 
1324   for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
1325     sse +=
1326         subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
1327                                      filter, height, &diff0[loop_cnt], 32);
1328     src += 16;
1329     dst += 16;
1330     sec_pred += 16;
1331   }
1332 
1333   *diff = diff0[0] + diff0[1];
1334 
1335   return sse;
1336 }
1337 
static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
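/* The _hv_ routines below apply the 2-tap bilinear filter horizontally and
 * then vertically (with FILTER_BITS rounding after each pass), average the
 * filtered block with the secondary prediction, and accumulate the SSE and
 * the sum of differences against the reference block. */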
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
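/* Shared 16-pixel-column worker for the 16/32/64-wide hv cases; as with the
 * vertical worker, 'width' is the row stride used when loading sec_pred. */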
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}
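/* Block variance from the accumulated values:
 *   variance = sse - (sum * sum) / (w * h)
 * with the division implemented as a right shift by log2(w * h); a 16x16
 * block, for example, uses shift 8 (256 pixels). Blocks of 512 pixels or
 * more take the _LARGE_ form so the squared sum is widened before the
 * shift. */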
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
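/* x_offset and y_offset are sub-pixel positions in 1/8-pel units (0..7) and
 * index bilinear_filters_msa[]; a zero offset in either direction dispatches
 * to the corresponding single-pass routine, and both offsets zero falls back
 * to the plain variance kernel. */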
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
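/* Illustrative caller sketch (assuming an MSA build and externally managed
 * src/ref buffers; not part of this file). Half-pel in both directions is
 * x_offset = y_offset = 4:
 *
 *   uint32_t sse;
 *   uint32_t var = vpx_sub_pixel_variance16x16_msa(src, src_stride, 4, 4,
 *                                                  ref, ref_stride, &sse);
 */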
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
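/* 32x64 is written out by hand so that its no-offset path can call the
 * dedicated avg_sse_diff_32x64_msa() helper instead of the generic
 * avg_sse_diff_32width_msa() used by the macro above. */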
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t x_offset, int32_t y_offset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[x_offset];
  const uint8_t *v_filter = bilinear_filters_msa[y_offset];

  if (y_offset) {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);