/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

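/* Accumulates the squared differences between two vectors of 16 unsigned
 * byte pixels into the v4i32 accumulator 'var': source and reference bytes
 * are interleaved, differenced into signed halfwords, and the squares are
 * added with a dot-product accumulate. */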
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

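/* Same as CALC_MSE_B, but also accumulates the per-pixel differences into
 * the v8i16 accumulator 'sub' so the caller can derive the block sum needed
 * for the variance computation. */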
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

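/* variance = sse - sum * sum / (width * height), with 'shift' equal to
 * log2(width * height). The LARGE variant computes the squared-sum term in
 * 64 bits so it cannot overflow for blocks of 512 or more pixels. */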
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

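/* Each sse_diff_*_msa helper returns the sum of squared errors between the
 * source and reference blocks and writes the sum of pixel differences to
 * '*diff'; the variance functions below need both values. */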
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     const uint8_t *ref_ptr, int32_t ref_stride,
                                     int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

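/* Sum of squares of a 16x16 block of int16 coefficients (256 values),
 * accumulated in 64-bit lanes before the final scalar reduction. */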
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

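/* SSE-only variants of the helpers above: same traversal, but only the
 * squared differences are accumulated and no difference sum is produced. */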
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

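/* SSE of a single 4x4 block: the four source rows and four reference rows
 * are packed into one vector each before a single CALC_MSE_B step. */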
uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v4i32 err0 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  CALC_MSE_B(src, ref, err0);

  return HADD_SW_S32(err0);
}

#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

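/* Instantiates vpx_variance<wd>x<ht>_msa(): computes the SSE and difference
 * sum with the matching sse_diff helper, stores the SSE through '*sse', and
 * returns SSE - sum^2 / (wd * ht) via the VARIANCE_* macros above. */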
#define VPX_VARIANCE_WDXHT_MSA(wd, ht)                                         \
  uint32_t vpx_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }

VPX_VARIANCE_WDXHT_MSA(4, 4);
VPX_VARIANCE_WDXHT_MSA(4, 8);

VPX_VARIANCE_WDXHT_MSA(8, 4)
VPX_VARIANCE_WDXHT_MSA(8, 8)
VPX_VARIANCE_WDXHT_MSA(8, 16)

VPX_VARIANCE_WDXHT_MSA(16, 8)
VPX_VARIANCE_WDXHT_MSA(16, 16)
VPX_VARIANCE_WDXHT_MSA(16, 32)

VPX_VARIANCE_WDXHT_MSA(32, 16)
VPX_VARIANCE_WDXHT_MSA(32, 32)

uint32_t vpx_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t vpx_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t vpx_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

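/* The MSE entry points return the raw sum of squared errors; unlike the
 * variance functions, no squared difference sum is subtracted. */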
uint32_t vpx_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride, uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t vpx_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t vpx_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

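/* These expose the intermediate SSE and sum of differences to the caller
 * instead of a final variance value. */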
void vpx_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void vpx_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t vpx_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }