/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/macros_msa.h"

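/* Accumulate the sum of squared differences between 16 src and ref bytes
 * into the four 32-bit lanes of 'var': interleave src/ref bytes, take the
 * pairwise horizontal subtraction to get signed per-pixel differences, then
 * dot-product each difference vector with itself. */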
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

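/* Same as CALC_MSE_B, but additionally accumulates the raw signed
 * differences into 'sub' so the caller can recover the block sum needed
 * for variance. */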
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

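/* variance = sse - sum^2 / (width * height); 'shift' is log2(width * height).
 * The LARGE variant widens sum^2 to 64 bits for the bigger block sizes,
 * where the squared sum no longer fits in 32 bits. */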
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

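/* sse_diff_*: return the sum of squared differences between src and ref and
 * write the signed sum of differences to *diff; both feed the VARIANCE_*
 * macros used by the aom_variance* wrappers below. */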
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

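/* The fixed-size 32x64, 64x32 and 64x64 helpers below split the difference
 * accumulation across several v8i16 vectors, so each 16-bit lane holds at
 * most 128 pixel differences and the running sum stays within int16 range
 * for these large blocks. */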
static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

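/* Sum of squares of 256 int16 values (a 16x16 block), accumulated in 64-bit
 * lanes and then folded down to a 32-bit result. */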
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

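/* sse_*width_msa: return only the sum of squared differences between src and
 * ref (no sum of differences); these back the aom_mse* wrappers below. */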
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

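/* Sum of squared errors for a single 4x4 block. */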
uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t err = 0;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v16u8 src_vec0, src_vec1;
  v8i16 diff0, diff1;
  v4i32 err0 = { 0 };
  v4i32 err1 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
  err = HADD_SW_S32(err0);
  err += HADD_SW_S32(err1);

  return err;
}

#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

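/* Instantiates aom_variance<wd>x<ht>_msa: stores the sum of squared
 * differences through *sse and returns sse - sum^2 / (wd * ht). */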
#define AOM_VARIANCE_WDXHT_MSA(wd, ht)                                         \
  uint32_t aom_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }

/* clang-format off */
AOM_VARIANCE_WDXHT_MSA(4, 4)
AOM_VARIANCE_WDXHT_MSA(4, 8)

AOM_VARIANCE_WDXHT_MSA(8, 4)
AOM_VARIANCE_WDXHT_MSA(8, 8)
AOM_VARIANCE_WDXHT_MSA(8, 16)

AOM_VARIANCE_WDXHT_MSA(16, 8)
AOM_VARIANCE_WDXHT_MSA(16, 16)
AOM_VARIANCE_WDXHT_MSA(16, 32)

AOM_VARIANCE_WDXHT_MSA(32, 16)
AOM_VARIANCE_WDXHT_MSA(32, 32)
/* clang-format on */

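/* The remaining sizes rely on the fixed-size helpers above, so they are
 * written out explicitly instead of going through AOM_VARIANCE_WDXHT_MSA. */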
uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

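/* aom_mse* wrappers: both *sse and the return value hold the raw sum of
 * squared errors for the block. */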
uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

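/* Return the sum of squared differences through *sse and the signed sum of
 * differences through *sum for 8x8 / 16x16 blocks. */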
void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }