/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"
#include "aom_dsp/mips/macros_msa.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/variance.h"

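/* Accumulate the sum of squared differences between 'src' and 'ref' into
 * 'var' (via a self dot-product of the per-pixel differences) and add the
 * sum of the signed differences into 'sub'. */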
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

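/* variance = sse - (sum * sum) / (width * height); the caller supplies
 * 'shift' = log2(width * height). A minimal usage sketch (hypothetical
 * caller, not part of this file):
 *
 *   sse = avg_sse_diff_8width_msa(src, src_stride, ref, ref_stride,
 *                                 sec_pred, 8, &diff);
 *   return VARIANCE_WxH(sse, diff, 6);  // 8x8 block: shift = log2(64)
 */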
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

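/* The avg_sse_diff_*_msa helpers below average the source block with a
 * second prediction ('sec_pred'), then return the sum of squared
 * differences against 'ref_ptr' and store the sum of differences in
 * '*diff'. */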
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

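/* The sub_pixel_sse_diff_*_h_msa helpers apply a 2-tap horizontal bilinear
 * filter (with FILTER_BITS rounding) to the source before accumulating the
 * sum of squared differences and the sum of differences against 'dst'. */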
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

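/* The sub_pixel_sse_diff_*_v_msa helpers do the same, but filter between
 * rows with a 2-tap vertical bilinear filter. */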
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

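/* The sub_pixel_sse_diff_*_hv_msa helpers filter horizontally first and
 * then vertically (separable 2-tap bilinear) before accumulating the SSE
 * and the sum of differences. */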
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

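/* The sub_pixel_avg_sse_diff_*_msa helpers combine both steps: the
 * sub-pixel filtered source is averaged with 'sec_pred' before the SSE
 * and the sum of differences are accumulated against 'dst'. */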
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter,int32_t height,int32_t * diff)1335 static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
1336     const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1337     int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
1338     int32_t height, int32_t *diff) {
1339   uint32_t loop_cnt, sse = 0;
1340   int32_t diff0[4];
1341 
1342   for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
1343     sse +=
1344         subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
1345                                      filter, height, &diff0[loop_cnt], 64);
1346     src += 16;
1347     dst += 16;
1348     sec_pred += 16;
1349   }
1350 
1351   *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
1352 
1353   return sse;
1354 }
1355 
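/* 4-wide case with both horizontal and vertical bilinear filtering: four rows
 * are 2-tap filtered horizontally, then vertically, averaged with sec_pred,
 * and compared against dst to accumulate the SSE and the signed sum of
 * differences written to *diff. */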
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

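/* 8-wide horizontal+vertical case; same flow as the 4-wide version, but two
 * 8-pixel rows are packed per vector and two 16-byte sec_pred vectors are
 * consumed per four-row iteration. */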
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

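/* Shared 16-wide horizontal+vertical helper: 'width' is the stride of the
 * sec_pred buffer, so the same code serves the 16-, 32- and 64-wide wrappers
 * below, each working on one 16-pixel column at a time. */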
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

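/* Wrappers dispatching to the 16-wide helper; wider blocks are handled as
 * multiple 16-pixel columns whose SSE values and diff sums are accumulated. */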
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

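/* variance = sse - (sum_of_diffs^2 / (width * height)); the third macro
 * argument is log2(width * height), so e.g. VARIANCE_16Wx16H computes
 * sse - ((diff * diff) >> 8).  Blocks of 512 or more pixels use the 64-bit
 * intermediate in VARIANCE_LARGE_WxH so the squared sum cannot overflow. */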
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

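/* Public sub-pixel variance entry points.  xoffset/yoffset index the 2-tap
 * bilinear filter table; a zero offset means no filtering is needed in that
 * direction, so the macro dispatches to the h-only, v-only, hv, or
 * plain-variance helper accordingly. */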
#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t aom_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

/* clang-format off */
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
/* clang-format on */

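/* Compound-prediction variants: same dispatch as above, but the filtered
 * prediction is first combined with sec_pred via a rounded byte average
 * before the SSE/diff accumulation. */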
#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }

/* clang-format off */
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
/* clang-format on */

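/* 32x64 is written out explicitly: its no-filter path uses the dedicated
 * avg_sse_diff_32x64_msa() helper, which takes no height argument, instead of
 * the height-parameterized avg_sse_diff_32width_msa() used by the macro. */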
uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t xoffset, int32_t yoffset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_2t[xoffset];
  const uint8_t *v_filter = bilinear_filters_2t[yoffset];

  if (yoffset) {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (xoffset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

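/* 64-wide blocks follow the same pattern; the no-filter path likewise calls
 * the fixed-size avg_sse_diff_64x##ht##_msa() helpers. */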
#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t aom_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

/* clang-format off */
AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
/* clang-format on */
