/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/mips/macros_msa.h"

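/* Accumulate the sum of squared byte differences between the src and ref
 * vectors into the four 32-bit lanes of var. */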
#define CALC_MSE_B(src, ref, var)                                   \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
  }

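/* As CALC_MSE_B, but also accumulate the signed pixel differences into sub
 * so the block sum can be recovered afterwards. */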
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  }

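/* variance = sse - sum^2 / (width * height), where shift = log2(w * h).
 * The LARGE variant squares the sum in 64 bits to avoid overflow. */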
#define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  sse - (((int64_t)diff * diff) >> shift)

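/* SSE and sum of differences for a 4-pixel-wide block of the given height,
 * processing four rows per loop iteration. */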
static uint32_t sse_diff_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  int32_t ht_cnt;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

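/* SSE and sum of differences for an 8-pixel-wide block; two rows are packed
 * into each vector, four rows per loop iteration. */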
static uint32_t sse_diff_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                    const uint8_t *ref_ptr, int32_t ref_stride,
                                    int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

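/* SSE and sum of differences for a 16-pixel-wide block, one row per vector,
 * four rows per loop iteration. */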
static uint32_t sse_diff_16width_msa(const uint8_t *src_ptr,
                                     int32_t src_stride,
                                     const uint8_t *ref_ptr,
                                     int32_t ref_stride, int32_t height,
                                     int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

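/* SSE and sum of differences for a 32-pixel-wide block (two vectors per
 * row), four rows per loop iteration. */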
static uint32_t sse_diff_32width_msa(const uint8_t *src_ptr,
                                     int32_t src_stride,
                                     const uint8_t *ref_ptr,
                                     int32_t ref_stride, int32_t height,
                                     int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

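/* SSE and sum of differences for a fixed 32x64 block; the sum is split
 * across two 16-bit accumulators. */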
static uint32_t sse_diff_32x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

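/* SSE and sum of differences for a fixed 64x32 block, two rows per loop
 * iteration. */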
static uint32_t sse_diff_64x32_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

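/* SSE and sum of differences for a fixed 64x64 block; the sum is spread over
 * four 16-bit accumulators. */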
static uint32_t sse_diff_64x64_msa(const uint8_t *src_ptr, int32_t src_stride,
                                   const uint8_t *ref_ptr, int32_t ref_stride,
                                   int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;

    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

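/* Sum of squares of 256 consecutive int16 values (a 16x16 block). */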
static uint32_t get_mb_ss_msa(const int16_t *src) {
  uint32_t sum, cnt;
  v8i16 src0, src1, src2, src3;
  v4i32 src0_l, src1_l, src2_l, src3_l;
  v4i32 src0_r, src1_r, src2_r, src3_r;
  v2i64 sq_src_l = { 0 };
  v2i64 sq_src_r = { 0 };

  for (cnt = 8; cnt--;) {
    LD_SH4(src, 8, src0, src1, src2, src3);
    src += 4 * 8;

    UNPCK_SH_SW(src0, src0_l, src0_r);
    UNPCK_SH_SW(src1, src1_l, src1_r);
    UNPCK_SH_SW(src2, src2_l, src2_r);
    UNPCK_SH_SW(src3, src3_l, src3_r);

    DPADD_SD2_SD(src0_l, src0_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src1_l, src1_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src2_l, src2_r, sq_src_l, sq_src_r);
    DPADD_SD2_SD(src3_l, src3_r, sq_src_l, sq_src_r);
  }

  sq_src_l += __msa_splati_d(sq_src_l, 1);
  sq_src_r += __msa_splati_d(sq_src_r, 1);

  sum = __msa_copy_s_d(sq_src_l, 0);
  sum += __msa_copy_s_d(sq_src_r, 0);

  return sum;
}

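/* SSE only (no sum of differences) for a 4-pixel-wide block. */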
static uint32_t sse_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

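/* SSE only for an 8-pixel-wide block. */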
static uint32_t sse_8width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

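/* SSE only for a 16-pixel-wide block. */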
static uint32_t sse_16width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src, ref;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    CALC_MSE_B(src, ref, var);
  }

  return HADD_SW_S32(var);
}

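/* SSE only for a 32-pixel-wide block. */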
static uint32_t sse_32width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v4i32 var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);

    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src1, ref1, var);
  }

  return HADD_SW_S32(var);
}

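/* SSE only for a 64-pixel-wide block, two rows per loop iteration. */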
static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v4i32 var = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);

    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    CALC_MSE_B(src0, ref0, var);
    CALC_MSE_B(src2, ref2, var);
    CALC_MSE_B(src1, ref1, var);
    CALC_MSE_B(src3, ref3, var);
  }

  return HADD_SW_S32(var);
}

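/* SSE of a single 4x4 block of 8-bit samples. */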
uint32_t aom_get4x4sse_cs_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride) {
  uint32_t err = 0;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16i8 src = { 0 };
  v16i8 ref = { 0 };
  v16u8 src_vec0, src_vec1;
  v8i16 diff0, diff1;
  v4i32 err0 = { 0 };
  v4i32 err1 = { 0 };

  LW4(src_ptr, src_stride, src0, src1, src2, src3);
  LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
  INSERT_W4_SB(src0, src1, src2, src3, src);
  INSERT_W4_SB(ref0, ref1, ref2, ref3, ref);
  ILVRL_B2_UB(src, ref, src_vec0, src_vec1);
  HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1);
  DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1);
  err = HADD_SW_S32(err0);
  err += HADD_SW_S32(err1);

  return err;
}

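/* Per-block-size variance wrappers; the shift argument is
 * log2(width * height). */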
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

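/* Instantiates aom_variance<wd>x<ht>_msa() in terms of the matching
 * sse_diff_<wd>width_msa() helper and VARIANCE_<wd>Wx<ht>H() macro,
 * e.g. AOM_VARIANCE_WDXHT_MSA(4, 4) defines aom_variance4x4_msa(). */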
#define AOM_VARIANCE_WDXHT_MSA(wd, ht)                                         \
  uint32_t aom_variance##wd##x##ht##_msa(                                      \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
      int32_t ref_stride, uint32_t *sse) {                                     \
    int32_t diff;                                                              \
                                                                               \
    *sse =                                                                     \
        sse_diff_##wd##width_msa(src, src_stride, ref, ref_stride, ht, &diff); \
                                                                               \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
  }

/* clang-format off */
AOM_VARIANCE_WDXHT_MSA(4, 4)
AOM_VARIANCE_WDXHT_MSA(4, 8)

AOM_VARIANCE_WDXHT_MSA(8, 4)
AOM_VARIANCE_WDXHT_MSA(8, 8)
AOM_VARIANCE_WDXHT_MSA(8, 16)

AOM_VARIANCE_WDXHT_MSA(16, 8)
AOM_VARIANCE_WDXHT_MSA(16, 16)
AOM_VARIANCE_WDXHT_MSA(16, 32)

AOM_VARIANCE_WDXHT_MSA(32, 16)
AOM_VARIANCE_WDXHT_MSA(32, 32)
/* clang-format on */

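/* The 32x64, 64x32 and 64x64 sizes use dedicated fixed-size helpers. */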
uint32_t aom_variance32x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_32x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_32Wx64H(*sse, diff);
}

uint32_t aom_variance64x32_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x32_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx32H(*sse, diff);
}

uint32_t aom_variance64x64_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               uint32_t *sse) {
  int32_t diff;

  *sse = sse_diff_64x64_msa(src, src_stride, ref, ref_stride, &diff);

  return VARIANCE_64Wx64H(*sse, diff);
}

uint32_t aom_mse8x8_msa(const uint8_t *src, int32_t src_stride,
                        const uint8_t *ref, int32_t ref_stride,
                        uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse8x16_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_8width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

uint32_t aom_mse16x8_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride,
                         uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 8);

  return *sse;
}

uint32_t aom_mse16x16_msa(const uint8_t *src, int32_t src_stride,
                          const uint8_t *ref, int32_t ref_stride,
                          uint32_t *sse) {
  *sse = sse_16width_msa(src, src_stride, ref, ref_stride, 16);

  return *sse;
}

void aom_get8x8var_msa(const uint8_t *src, int32_t src_stride,
                       const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                       int32_t *sum) {
  *sse = sse_diff_8width_msa(src, src_stride, ref, ref_stride, 8, sum);
}

void aom_get16x16var_msa(const uint8_t *src, int32_t src_stride,
                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
                         int32_t *sum) {
  *sse = sse_diff_16width_msa(src, src_stride, ref, ref_stride, 16, sum);
}

uint32_t aom_get_mb_ss_msa(const int16_t *src) { return get_mb_ss_msa(src); }