1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include "config/aom_dsp_rtcd.h"
13
14 #include "aom_ports/mem.h"
15 #include "aom_dsp/mips/macros_msa.h"
16 #include "aom_dsp/aom_filter.h"
17 #include "aom_dsp/variance.h"
18
/* Accumulate squared error and signed pixel-difference sum for two byte
 * vectors. ILVRL_B2_UB interleaves src/ref bytes, HSUB_UB2_SH forms signed
 * per-pixel differences, and DPADD_SH2_SW adds their squares into 'var'
 * (v4i32 accumulator). The raw differences are also summed into 'sub'
 * (v8i16 accumulator). Wrapped in do/while(0) so the macro behaves as a
 * single statement and is safe inside unbraced if/else bodies. */
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  do {                                                              \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    sub += res_l0_m + res_l1_m;                                     \
  } while (0)
30
31 #define VARIANCE_WxH(sse, diff, shift) sse - (((uint32_t)diff * diff) >> shift)
32
/* Same as VARIANCE_WxH but squares the difference sum in 64-bit arithmetic
 * for large blocks whose diff^2 can overflow 32 bits. Fully parenthesized
 * so the macro composes correctly in larger expressions. */
#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))
35
/* Sum of squared errors (returned) and signed pixel-difference sum
 * (stored through 'diff') for a 4-pixel-wide block, where the source is
 * first averaged with a second prediction buffer (compound prediction).
 * Processes 4 rows per loop iteration; 'height' is expected to be a
 * multiple of 4. 'sec_pred' is consumed contiguously, 16 bytes per
 * 4 rows. */
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };  /* per-lane running sum of signed differences */
  v4i32 vec, var = { 0 };  /* per-lane running sum of squared differences */

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    /* Load four 4-byte rows as 32-bit words, then pack each quad of
     * words into a single 16-byte vector. */
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    /* Rounding average of source with the second prediction. */
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  /* Horizontally reduce both accumulators to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
70
/* SSE (returned) and signed difference sum (via 'diff') for an
 * 8-pixel-wide block with compound prediction (source averaged with
 * 'sec_pred' first). Processes 4 rows per iteration; 'height' is
 * expected to be a multiple of 4. 'sec_pred' advances 32 bytes per
 * 4 rows. */
static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    /* Pack pairs of 8-byte rows into full 16-byte vectors so each
     * CALC_MSE_AVG_B handles two rows at once. */
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
104
/* SSE (returned) and signed difference sum (via 'diff') for a
 * 16-pixel-wide block with compound prediction. One 16-byte vector per
 * row; the loop body is manually unrolled 4x, so 'height' is expected
 * to be a multiple of 4. */
static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Row 1: average src with second prediction, accumulate vs ref. */
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    /* Row 2. */
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    /* Row 3. */
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    /* Row 4. */
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
159
/* SSE (returned) and signed difference sum (via 'diff') for a
 * 32-pixel-wide block with compound prediction. Each row spans two
 * 16-byte vectors; the loop body is manually unrolled 4x, so 'height'
 * is expected to be a multiple of 4. */
static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Row 1: two 16-byte halves per 32-pixel row. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    /* Row 2. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    /* Row 3. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    /* Row 4. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
218
/* SSE (returned) and signed difference sum (via 'diff') for a fixed
 * 32x64 block with compound prediction. 16 iterations x 4 unrolled rows
 * = 64 rows. Two separate v8i16 difference accumulators (avg0/avg1) are
 * used, one per 16-byte half of the row — presumably to keep per-lane
 * 16-bit sums from overflowing over 64 rows (TODO confirm). */
static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };  /* difference sum, low 16-byte half of each row */
  v8i16 avg1 = { 0 };  /* difference sum, high 16-byte half of each row */
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    /* Row 1. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    /* Row 2. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    /* Row 3. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    /* Row 4. */
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  /* Reduce both difference accumulators, then the SSE accumulator. */
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
278
/* SSE (returned) and signed difference sum (via 'diff') for a fixed
 * 64x32 block with compound prediction. Each row spans four 16-byte
 * vectors; 16 iterations x 2 unrolled rows = 32 rows. Difference sums
 * are split across two v8i16 accumulators (avg0/avg1). */
static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };  /* difference sum, vector pairs 0 and 2 */
  v8i16 avg1 = { 0 };  /* difference sum, vector pairs 1 and 3 */
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    /* Row 1: four 16-byte quarters per 64-pixel row. */
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    /* Row 2. */
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
327
/* SSE (returned) and signed difference sum (via 'diff') for a fixed
 * 64x64 block with compound prediction. Four 16-byte vectors per row;
 * 32 iterations x 2 unrolled rows = 64 rows. Four separate v8i16
 * difference accumulators (avg0..avg3), one per row quarter. */
static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };  /* difference sum, first 16 pixels of each row */
  v8i16 avg1 = { 0 };  /* second 16 pixels */
  v8i16 avg2 = { 0 };  /* third 16 pixels */
  v8i16 avg3 = { 0 };  /* fourth 16 pixels */
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    /* Row 1. */
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    /* Row 2. */
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  /* Reduce all four difference accumulators, then the SSE accumulator. */
  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
379
/* SSE (returned) and signed difference sum (via 'diff') between a
 * horizontally bilinear-interpolated 4-wide source block and 'dst'.
 * 'filter' points to a pair of 2-tap filter coefficients loaded as one
 * 16-bit value and broadcast across the vector. Processes 4 rows per
 * iteration; 'height' is expected to be a multiple of 4. */
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  /* Shuffle mask pairing each pixel with its right neighbor for the
   * 2-tap horizontal filter. */
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    /* Gather adjacent pixel pairs, dot-product with the taps, then
     * round-shift by FILTER_BITS. */
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    /* Pack the four filtered 4-pixel rows into one 16-byte vector. */
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
419
/* SSE (returned) and signed difference sum (via 'diff') between a
 * horizontally bilinear-interpolated 8-wide source block and 'dst'.
 * 2-tap filter broadcast from 'filter'; 4 rows per iteration, 'height'
 * expected to be a multiple of 4. */
static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  /* Shuffle mask pairing each pixel with its right neighbor. */
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack pairs of 8-byte reference rows into 16-byte vectors. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    /* Merge two filtered rows per vector to match the packed refs. */
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
460
/* SSE (returned) and signed difference sum (via 'diff') between a
 * horizontally bilinear-interpolated 16-wide source block and 'dst'.
 * Each row is filtered as two 8-pixel halves (loaded at src and src+8)
 * because the 2-tap filter needs one pixel beyond each half. 4 rows per
 * iteration; 'height' expected to be a multiple of 4. */
static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  /* Shuffle mask pairing each pixel with its right neighbor. */
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-indexed vectors hold the left halves, odd the right halves. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    /* Re-join the halves into full 16-pixel filtered rows. */
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
507
sub_pixel_sse_diff_32width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)508 static uint32_t sub_pixel_sse_diff_32width_h_msa(
509 const uint8_t *src, int32_t src_stride, const uint8_t *dst,
510 int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
511 uint32_t loop_cnt, sse = 0;
512 int32_t diff0[2];
513
514 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
515 sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
516 filter, height, &diff0[loop_cnt]);
517 src += 16;
518 dst += 16;
519 }
520
521 *diff = diff0[0] + diff0[1];
522
523 return sse;
524 }
525
sub_pixel_sse_diff_64width_h_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)526 static uint32_t sub_pixel_sse_diff_64width_h_msa(
527 const uint8_t *src, int32_t src_stride, const uint8_t *dst,
528 int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
529 uint32_t loop_cnt, sse = 0;
530 int32_t diff0[4];
531
532 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
533 sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
534 filter, height, &diff0[loop_cnt]);
535 src += 16;
536 dst += 16;
537 }
538
539 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
540
541 return sse;
542 }
543
/* SSE (returned) and signed difference sum (via 'diff') between a
 * vertically bilinear-interpolated 4-wide source block and 'dst'.
 * Needs height+1 source rows; src0 carries the previous iteration's last
 * row. Processes 4 output rows per iteration; 'height' expected to be a
 * multiple of 4. */
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime the pipeline with the row above the first output row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    /* Interleave each row with the row below it for the vertical
     * 2-tap dot product. */
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;  /* carry last row into the next iteration */
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
587
/* SSE (returned) and signed difference sum (via 'diff') between a
 * vertically bilinear-interpolated 8-wide source block and 'dst'.
 * Needs height+1 source rows; 4 output rows per iteration, 'height'
 * expected to be a multiple of 4. */
static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime with the row above the first output row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack pairs of 8-byte reference rows into 16-byte vectors. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    /* Interleave each row with the next for the vertical dot product. */
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;  /* carry last row into the next iteration */
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
630
/* SSE (returned) and signed difference sum (via 'diff') between a
 * vertically bilinear-interpolated 16-wide source block and 'dst'.
 * Each output row needs both the right- and left-interleaved halves of a
 * row pair (ILVR/ILVL). Needs height+1 source rows; 4 output rows per
 * iteration, 'height' expected to be a multiple of 4. */
static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime with the row above the first output row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Rows 0/1: vec0/vec1 = (src0,src1) pair; vec2/vec3 = (src1,src2). */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    /* Rows 2/3 interleaves: vec4..vec7 = (src2,src3) and (src3,src4). */
    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;  /* carry last row into the next iteration */

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
689
sub_pixel_sse_diff_32width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)690 static uint32_t sub_pixel_sse_diff_32width_v_msa(
691 const uint8_t *src, int32_t src_stride, const uint8_t *dst,
692 int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
693 uint32_t loop_cnt, sse = 0;
694 int32_t diff0[2];
695
696 for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
697 sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
698 filter, height, &diff0[loop_cnt]);
699 src += 16;
700 dst += 16;
701 }
702
703 *diff = diff0[0] + diff0[1];
704
705 return sse;
706 }
707
sub_pixel_sse_diff_64width_v_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * filter,int32_t height,int32_t * diff)708 static uint32_t sub_pixel_sse_diff_64width_v_msa(
709 const uint8_t *src, int32_t src_stride, const uint8_t *dst,
710 int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
711 uint32_t loop_cnt, sse = 0;
712 int32_t diff0[4];
713
714 for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
715 sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
716 filter, height, &diff0[loop_cnt]);
717 src += 16;
718 dst += 16;
719 }
720
721 *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
722
723 return sse;
724 }
725
/* SSE (returned) and signed difference sum (via 'diff') between a
 * 4-wide source block bilinear-interpolated in both directions
 * (horizontal pass then vertical pass) and 'dst'. Needs height+1 source
 * rows; 4 output rows per iteration, 'height' expected to be a multiple
 * of 4. */
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  /* Mask pairs adjacent pixels; indices >= 16 select from the second
   * source operand, so two rows are filtered per VSHF. */
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  /* Broadcast the 2-tap coefficient pairs for each pass. */
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    /* Horizontal pass over rows 0..4, two rows per call. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    /* Rebuild shifted row sequences for the vertical pass. */
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;  /* carry last row into the next iteration */
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
774
/* SSE (returned) and signed difference sum (via 'diff') between an
 * 8-wide source block bilinear-interpolated both horizontally and
 * vertically and 'dst'. The horizontal result of each row (hz_out0/1,
 * alternating) feeds the vertical 2-tap filter of the next row. Needs
 * height+1 source rows; 4 output rows per iteration. */
static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  /* Shuffle mask pairing each pixel with its right neighbor. */
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  /* Horizontal-filter the priming row above the first output row. */
  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack pairs of 8-byte reference rows into 16-byte vectors. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    /* For each output row: horizontal-filter the new source row, then
     * vertically combine it with the previous row's horizontal result. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
831
/* SSE (returned) and signed difference sum (via 'diff') between a
 * 16-wide source block bilinear-interpolated both horizontally and
 * vertically and 'dst'. Each row is horizontally filtered as two
 * 8-pixel halves (src and src+8); hz_out0/hz_out2 and hz_out1/hz_out3
 * alternate as the previous/current rows for the vertical pass. Needs
 * height+1 source rows; 4 output rows per iteration. */
static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  /* Shuffle mask pairing each pixel with its right neighbor. */
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };  /* running signed-difference sum */
  v4i32 vec, var = { 0 };  /* running squared-difference sum */

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  /* Horizontal-filter both halves of the priming row. */
  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-indexed vectors hold left halves, odd the right halves. */
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Output row 0: combine previous-row and current-row horizontal
     * results through the vertical 2-tap filter. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    /* Output row 1. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    /* Output row 2. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    /* Output row 3. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
904
/* 32-wide HV sub-pixel SSE/diff: the block is processed as two
 * independent 16-wide columns; SSE and diff sums are combined. */
static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 2; ++col) {
    int32_t col_diff;

    sse_total += sub_pixel_sse_diff_16width_hv_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        filter_horiz, filter_vert, height, &col_diff);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
924
/* 64-wide HV sub-pixel SSE/diff: the block is processed as four
 * independent 16-wide columns; SSE and diff sums are combined. */
static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 4; ++col) {
    int32_t col_diff;

    sse_total += sub_pixel_sse_diff_16width_hv_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        filter_horiz, filter_vert, height, &col_diff);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
944
/* Computes SSE (return value) and sum of pixel differences (*diff) for a
 * 4-wide block: the source is filtered with a 2-tap horizontal filter,
 * averaged with the secondary prediction in sec_pred, then compared
 * against the reference in dst.  Four 4-pixel rows are packed into one
 * vector per iteration; height is assumed to be a multiple of 4. */
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack four 4-pixel reference rows into one 16-byte vector. */
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    /* Pack the four filtered rows together, then average with sec_pred. */
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
989
/* Computes SSE (return value) and sum of pixel differences (*diff) for an
 * 8-wide block: source rows are filtered with a 2-tap horizontal filter,
 * averaged with sec_pred, then compared with the reference.  Two 8-pixel
 * rows are packed per 16-byte vector; height must be a multiple of 4. */
static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack pairs of 8-pixel reference rows into ref0/ref1. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    /* Each 16-byte sec_pred load covers two filtered 8-pixel rows. */
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1039
/* Shared 16-wide worker for the averaging horizontal sub-pixel variance
 * kernels.  Filters the source with a 2-tap horizontal filter, averages
 * with sec_pred, and accumulates SSE (return value) and the sum of pixel
 * differences (*diff) against dst.  width is the row stride of sec_pred
 * (16/32/64), letting 32- and 64-wide callers process one 16-wide column
 * at a time.  height must be a multiple of 4. */
static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* 4 rows per iteration, each as left/right 8-byte halves. */
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    /* Average filtered rows with the secondary prediction, then accumulate. */
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1094
/* 16-wide averaging horizontal variant: delegates to the shared worker
 * with the secondary-prediction stride fixed at 16. */
static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  const uint32_t sse = subpel_avg_ssediff_16w_h_msa(
      src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16);

  return sse;
}
1102
/* 32-wide averaging horizontal variant: two independent 16-wide columns;
 * the secondary prediction rows are 32 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 2; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_h_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter, height, &col_diff, 32);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1123
/* 64-wide averaging horizontal variant: four independent 16-wide columns;
 * the secondary prediction rows are 64 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 4; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_h_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter, height, &col_diff, 64);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1144
/* Computes SSE (return value) and sum of pixel differences (*diff) for a
 * 4-wide block: the source is filtered with a 2-tap vertical filter,
 * averaged with sec_pred, then compared with the reference.  Four rows
 * per iteration; height must be a multiple of 4. */
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime the vertical filter with the row above the block. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack four 4-pixel reference rows into one 16-byte vector. */
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    /* Interleave adjacent row pairs so one dot product applies the 2-tap
     * vertical filter to two rows at once. */
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    /* Carry the last row over as the "previous" row for the next group. */
    src0 = src4;
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1192
/* Computes SSE (return value) and sum of pixel differences (*diff) for an
 * 8-wide block: the source is filtered with a 2-tap vertical filter,
 * averaged with sec_pred, then compared with the reference.  Two 8-pixel
 * rows are packed per vector; height must be a multiple of 4. */
static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime the vertical filter with the row above the block. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    /* Pack pairs of 8-pixel reference rows into ref0/ref1. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    /* Interleave adjacent row pairs for the 2-tap vertical dot product. */
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    /* Carry the last row over as the "previous" row for the next group. */
    src0 = src4;
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1239
/* Shared 16-wide worker for the averaging vertical sub-pixel variance
 * kernels.  Filters the source with a 2-tap vertical filter, averages
 * with sec_pred, and accumulates SSE (return value) and the sum of pixel
 * differences (*diff) against dst.  width is the row stride of sec_pred
 * (16/32/64).  height must be a multiple of 4. */
static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  /* Prime the vertical filter with the row above the block. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    /* vec0..vec3 interleave rows 0-2, vec4..vec7 rows 2-4; the dot
     * products below are interleaved with the ILV steps for scheduling. */
    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    /* Carry the last row over as the "previous" row for the next group. */
    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1305
/* 16-wide averaging vertical variant: delegates to the shared worker
 * with the secondary-prediction stride fixed at 16. */
static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  const uint32_t sse = subpel_avg_ssediff_16w_v_msa(
      src, src_stride, dst, dst_stride, sec_pred, filter, height, diff, 16);

  return sse;
}
1313
/* 32-wide averaging vertical variant: two independent 16-wide columns;
 * the secondary prediction rows are 32 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 2; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_v_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter, height, &col_diff, 32);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1334
/* 64-wide averaging vertical variant: four independent 16-wide columns;
 * the secondary prediction rows are 64 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 4; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_v_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter, height, &col_diff, 64);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1355
/* Computes SSE (return value) and sum of pixel differences (*diff) for a
 * 4-wide block: the source is filtered horizontally and vertically with
 * 2-tap filters, averaged with sec_pred, then compared with the
 * reference.  The shuffle mask spans two source vectors (indices >= 16)
 * so one HORIZ_2TAP_FILT_UH handles a pair of rows.  height must be a
 * multiple of 4. */
static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 out, pred, ref = { 0 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  /* Prime the vertical filter with the row above the block. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    /* Horizontal filter over row pairs, then rebuild the per-row sequence
     * hz_out0..hz_out4 needed by the vertical 2-tap filter. */
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    /* Carry the last row over as the "previous" row for the next group. */
    src0 = src4;
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1406
/* Computes SSE (return value) and sum of pixel differences (*diff) for an
 * 8-wide block: the source is filtered horizontally and vertically with
 * 2-tap filters, averaged with sec_pred, then compared with the
 * reference.  hz_out0/hz_out1 ping-pong as the previous/current row of
 * horizontal output for the vertical filter.  height must be a multiple
 * of 4. */
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  /* Horizontally filter the priming row above the block. */
  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Pack pairs of 8-pixel reference rows into ref0/ref1. */
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1471
/* Shared 16-wide worker for the averaging HV sub-pixel variance kernels.
 * Filters the source horizontally then vertically with 2-tap filters,
 * averages with sec_pred, and accumulates SSE (return value) and the sum
 * of pixel differences (*diff) against dst.  width is the row stride of
 * sec_pred (16/32/64).  height must be a multiple of 4. */
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  /* Broadcast the two filter taps (loaded together as one 16-bit value). */
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  /* Horizontally filter the priming row (two 8-byte halves). */
  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* 4 rows per iteration, each as left/right 8-byte halves. */
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    /* Per row: horizontal filter, then vertical filter against the previous
     * row's output; hz_out0/1 and hz_out2/3 ping-pong between roles. */
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    /* Average filtered rows with the secondary prediction, then accumulate. */
    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  /* Fold the per-lane accumulators down to scalars. */
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
1551
/* 16-wide averaging HV variant: delegates to the shared worker with the
 * secondary-prediction stride fixed at 16. */
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  const uint32_t sse =
      subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride, sec_pred,
                                    filter_horiz, filter_vert, height, diff,
                                    16);

  return sse;
}
1560
/* 32-wide averaging HV variant: two independent 16-wide columns; the
 * secondary prediction rows are 32 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 2; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_hv_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter_horiz, filter_vert, height, &col_diff,
        32);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1581
/* 64-wide averaging HV variant: four independent 16-wide columns; the
 * secondary prediction rows are 64 bytes apart. */
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t col;
  uint32_t sse_total = 0;
  int32_t diff_total = 0;

  for (col = 0; col < 4; ++col) {
    int32_t col_diff;

    sse_total += subpel_avg_ssediff_16w_hv_msa(
        src + (col << 4), src_stride, dst + (col << 4), dst_stride,
        sec_pred + (col << 4), filter_horiz, filter_vert, height, &col_diff,
        64);
    diff_total += col_diff;
  }

  *diff = diff_total;

  return sse_total;
}
1602
/* Variance = SSE - (sum_of_diffs^2 >> shift), where shift = log2(wd * ht)
 * so the squared difference sum is divided by the block's pixel count
 * (see VARIANCE_WxH above). */
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

/* Larger blocks square the difference sum in 64-bit arithmetic
 * (VARIANCE_LARGE_WxH above) to avoid 32-bit overflow. */
#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
1617
// Generates aom_sub_pixel_variance<wd>x<ht>_msa(): bilinear sub-pixel
// variance of src vs ref.  xoffset / yoffset (0..7) select the 2-tap
// bilinear filter taps; depending on which offsets are non-zero the h+v,
// v-only, h-only, or plain integer-pel kernel computes SSE (stored through
// *sse) and the signed diff sum, which VARIANCE_<wd>Wx<ht>H converts to the
// returned variance.  On the zero-offset path aom_variance<wd>x<ht>_msa
// already returns the variance, so `diff` stays unused there.
#define AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t aom_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t xoffset,                \
      int32_t yoffset, const uint8_t *ref, int32_t ref_stride,                \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = aom_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }
1652
/* clang-format off */
// Instantiate aom_sub_pixel_variance<W>x<H>_msa() for every supported
// block size, grouped by width.
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64)

AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32)
AOM_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64)
/* clang-format on */
1672
// Generates aom_sub_pixel_avg_variance<wd>x<ht>_msa(): like the plain
// sub-pixel variance above, but the filtered prediction is first averaged
// with sec_pred (compound prediction) before being compared against
// ref_ptr.  All four offset combinations go through a *_avg_* kernel that
// produces both the SSE (through *sse) and the signed diff sum, so the
// variance conversion happens once at the end.
#define AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t aom_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }
1705
/* clang-format off */
// Instantiate aom_sub_pixel_avg_variance<W>x<H>_msa() for sizes up to
// 32x32; the 32x64 and 64-wide variants are defined separately.
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32)

AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16)
AOM_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32)
/* clang-format on */
1721
aom_sub_pixel_avg_variance32x64_msa(const uint8_t * src_ptr,int32_t src_stride,int32_t xoffset,int32_t yoffset,const uint8_t * ref_ptr,int32_t ref_stride,uint32_t * sse,const uint8_t * sec_pred)1722 uint32_t aom_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
1723 int32_t src_stride,
1724 int32_t xoffset, int32_t yoffset,
1725 const uint8_t *ref_ptr,
1726 int32_t ref_stride, uint32_t *sse,
1727 const uint8_t *sec_pred) {
1728 int32_t diff;
1729 const uint8_t *h_filter = bilinear_filters_2t[xoffset];
1730 const uint8_t *v_filter = bilinear_filters_2t[yoffset];
1731
1732 if (yoffset) {
1733 if (xoffset) {
1734 *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
1735 src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
1736 v_filter, 64, &diff);
1737 } else {
1738 *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
1739 ref_stride, sec_pred,
1740 v_filter, 64, &diff);
1741 }
1742 } else {
1743 if (xoffset) {
1744 *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
1745 ref_stride, sec_pred,
1746 h_filter, 64, &diff);
1747 } else {
1748 *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
1749 sec_pred, &diff);
1750 }
1751 }
1752
1753 return VARIANCE_32Wx64H(*sse, diff);
1754 }
1755
// Generates aom_sub_pixel_avg_variance64x<ht>_msa().  Separate from the
// generic WDXHT macro because the zero-offset path calls the
// height-specialized avg_sse_diff_64x<ht>_msa() kernel (no `ht` argument)
// rather than a generic avg_sse_diff_64width_msa(..., ht, ...).
#define AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t aom_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t xoffset,            \
      int32_t yoffset, const uint8_t *ref_ptr, int32_t ref_stride,            \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_2t[xoffset];                   \
    const uint8_t *v_filter = bilinear_filters_2t[yoffset];                   \
                                                                              \
    if (yoffset) {                                                            \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (xoffset) {                                                          \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }
1788
/* clang-format off */
// Instantiate the 64-wide averaged variants: 64x32 and 64x64.
AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32)
AOM_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64)
/* clang-format on */
1793