/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/mips/macros_msa.h"
#include "vpx_dsp/variance.h"

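/* 2-tap bilinear filter pairs for the eight 1/8-pel sub-pixel offsets.
 * Each pair sums to 128 (1 << FILTER_BITS), so offset 0 ({ 128, 0 }) is a
 * pass-through. */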
static const uint8_t bilinear_filters_msa[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
};

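/* Accumulates, for one pair of 16-byte vectors, the squared error of
 * (src - ref) into 'var' and the signed pixel difference into 'sub'. */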
#define CALC_MSE_AVG_B(src, ref, var, sub)                          \
  {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                       \
    v8i16 res_l0_m, res_l1_m;                                       \
                                                                    \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                      \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);            \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
                                                                    \
    (sub) += res_l0_m + res_l1_m;                                   \
  }

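/* variance = sse - (sum * sum) / (width * height), with 'shift' equal to
 * log2(width * height). The large variant widens to 64 bits so the squared
 * sum cannot overflow for the larger block sizes. */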
#define VARIANCE_WxH(sse, diff, shift) \
  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))

#define VARIANCE_LARGE_WxH(sse, diff, shift) \
  (sse) - (((int64_t)(diff) * (diff)) >> (shift))

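/* avg_sse_diff_*_msa: average the source block with a second predictor
 * (compound prediction), then return the SSE against the reference and
 * write the signed pixel-difference sum to *diff. */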
static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 pred, src = { 0 };
  v16u8 ref = { 0 };
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        const uint8_t *ref_ptr,
                                        int32_t ref_stride,
                                        const uint8_t *sec_pred, int32_t height,
                                        int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src, ref, pred;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref = LD_UB(ref_ptr);
    ref_ptr += ref_stride;
    src = __msa_aver_u_b(src, pred);
    CALC_MSE_AVG_B(src, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
                                         int32_t src_stride,
                                         const uint8_t *ref_ptr,
                                         int32_t ref_stride,
                                         const uint8_t *sec_pred,
                                         int32_t height, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1, pred0, pred1;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);

    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB2(src_ptr, 16, src0, src1);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 16; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src2, ref2, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src3, ref3, var, avg1);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);

  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       const uint8_t *ref_ptr,
                                       int32_t ref_stride,
                                       const uint8_t *sec_pred, int32_t *diff) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v8i16 avg0 = { 0 };
  v8i16 avg1 = { 0 };
  v8i16 avg2 = { 0 };
  v8i16 avg3 = { 0 };
  v4i32 vec, var = { 0 };

  for (ht_cnt = 32; ht_cnt--;) {
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);

    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
    src_ptr += src_stride;
    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
    ref_ptr += ref_stride;
    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, ref0, var, avg0);
    CALC_MSE_AVG_B(src1, ref1, var, avg1);
    CALC_MSE_AVG_B(src2, ref2, var, avg2);
    CALC_MSE_AVG_B(src3, ref3, var, avg3);
  }

  vec = __msa_hadd_s_w(avg0, avg0);
  vec += __msa_hadd_s_w(avg1, avg1);
  vec += __msa_hadd_s_w(avg2, avg2);
  vec += __msa_hadd_s_w(avg3, avg3);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

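/* Horizontal sub-pixel SSE/diff: the two bilinear taps are loaded as one
 * 16-bit pair (LH) and splatted across the vector; each row is filtered over
 * adjacent pixel pairs selected by 'mask', rounded by FILTER_BITS, then
 * compared against dst. */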
static uint32_t sub_pixel_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    CALC_MSE_AVG_B(src0, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 filt0, out, ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6, src0, src1,
                src2, src3);
    CALC_MSE_AVG_B(src0, dst0, var, avg);
    CALC_MSE_AVG_B(src1, dst1, var, avg);
    CALC_MSE_AVG_B(src2, dst2, var, avg);
    CALC_MSE_AVG_B(src3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

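/* Vertical sub-pixel SSE/diff: adjacent rows are interleaved (ILVR/ILVL) so
 * the 2-tap filter can be applied as a byte dot product down each column;
 * one extra row (src0 = src4) is carried across loop iterations. */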
static uint32_t sub_pixel_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4, out;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 ref = { 0 };
  v16u8 src2110, src4332;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
                                            filter, height, &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

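/* Combined horizontal + vertical sub-pixel SSE/diff: each row is first
 * bilinear-filtered horizontally, then consecutive horizontal outputs are
 * filtered vertically before the SSE/diff accumulation. */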
static uint32_t sub_pixel_sse_diff_4width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
                                             filter_horiz, filter_vert, height,
                                             &diff0[loop_cnt]);
    src += 16;
    dst += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

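/* The sub_pixel_avg_* variants below additionally average the filtered
 * result with sec_pred (compound prediction) before the SSE/diff
 * accumulation. */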
static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 out, pred, filt0, ref = { 0 };
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 out, pred, filt0;
  v16u8 ref0, ref1, ref2, ref3;
  v16i8 src0, src1, src2, src3;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 vec0, vec1, vec2, vec3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
                vec2, vec3);
    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3, src0, src1,
                src2, src3);
    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);

    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref0, var, avg);
    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

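/* 16-pixel-wide horizontal worker shared by the 16/32/64 wrappers below;
 * 'width' is the sec_pred stride, which spans the full block width while
 * each call walks one 16-pixel column. */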
static uint32_t subpel_avg_ssediff_16w_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v16u8 dst0, dst1, dst2, dst3;
  v16u8 tmp0, tmp1, tmp2, tmp3;
  v16u8 pred0, pred1, pred2, pred3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src0, src2, src4, src6);
    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    dst += (4 * dst_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0, out1,
                out2, out3);
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, out4, out5,
                out6, out7);
    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6, tmp0, tmp1,
                tmp2, tmp3);
    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3, tmp0, tmp1,
                tmp2, tmp3);

    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

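/* Vertical filtering variants with second-predictor averaging; the 16-wide
 * worker again takes the sec_pred stride as 'width' so the 32/64 wrappers
 * can reuse it per 16-pixel column. */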
static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 src10_r, src32_r, src21_r, src43_r;
  v16u8 out, pred, ref = { 0 };
  v16u8 src2110, src4332, filt0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };
  v8u16 tmp0, tmp1;

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    out = __msa_aver_u_b(out, pred);
    CALC_MSE_AVG_B(out, ref, var, avg);
    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, filt0;
  v8u16 vec0, vec1, vec2, vec3;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);

    src0 = src4;
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t subpel_avg_ssediff_16w_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1, out2, out3, filt0;
  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter);
  filt0 = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);

    src0 = src4;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
                                      sec_pred, filter, height, diff, 16);
}

static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
    int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse +=
        subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride, sec_pred,
                                     filter, height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

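/* Combined horizontal + vertical filtering variants with second-predictor
 * averaging, structured like the h/v cases above. */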
sub_pixel_avg_sse_diff_4width_hv_msa(const uint8_t * src,int32_t src_stride,const uint8_t * dst,int32_t dst_stride,const uint8_t * sec_pred,const uint8_t * filter_horiz,const uint8_t * filter_vert,int32_t height,int32_t * diff)1359 static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
1360 const uint8_t *src, int32_t src_stride, const uint8_t *dst,
1361 int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
1362 const uint8_t *filter_vert, int32_t height, int32_t *diff) {
1363 int16_t filtval;
1364 uint32_t loop_cnt;
1365 uint32_t ref0, ref1, ref2, ref3;
1366 v16u8 src0, src1, src2, src3, src4;
1367 v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1368 v16u8 filt_hz, filt_vt, vec0, vec1;
1369 v16u8 out, pred, ref = { 0 };
1370 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
1371 v8i16 avg = { 0 };
1372 v4i32 vec, var = { 0 };
1373
1374 filtval = LH(filter_horiz);
1375 filt_hz = (v16u8)__msa_fill_h(filtval);
1376 filtval = LH(filter_vert);
1377 filt_vt = (v16u8)__msa_fill_h(filtval);
1378
1379 src0 = LD_UB(src);
1380 src += src_stride;
1381
1382 for (loop_cnt = (height >> 2); loop_cnt--;) {
1383 LD_UB4(src, src_stride, src1, src2, src3, src4);
1384 src += (4 * src_stride);
1385 pred = LD_UB(sec_pred);
1386 sec_pred += 16;
1387 LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
1388 dst += (4 * dst_stride);
1389 INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
1390 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
1391 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
1392 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
1393 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
1394 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
1395 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1396 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1397 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
1398 out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
1399 out = __msa_aver_u_b(out, pred);
1400 CALC_MSE_AVG_B(out, ref, var, avg);
1401 src0 = src4;
1402 }
1403
1404 vec = __msa_hadd_s_w(avg, avg);
1405 *diff = HADD_SW_S32(vec);
1406
1407 return HADD_SW_S32(var);
1408 }
1409
static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 pred0, pred1, out0, out1;
  v16u8 filt_hz, filt_vt, vec0;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);

    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);

    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

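/* Common worker for the 16-, 32- and 64-wide h+v compound cases.  It
 * processes a single 16-pixel-wide column; width is the row stride of
 * sec_pred, which lets the wider wrappers below walk the prediction
 * buffer in 16-pixel steps. */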
static uint32_t subpel_avg_ssediff_16w_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3;
  v16u8 out0, out1, out2, out3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
    sec_pred += (4 * width);

    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3, out0, out1,
                out2, out3);

    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
    CALC_MSE_AVG_B(out2, ref2, var, avg);
    CALC_MSE_AVG_B(out3, ref3, var, avg);
  }

  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}

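/* A 16-wide block is exactly one pass of the worker above. */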
static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                       sec_pred, filter_horiz, filter_vert,
                                       height, diff, 16);
}

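/* 32-wide blocks are split into two adjacent 16-pixel columns; the SSEs
 * are summed and the per-column difference sums combined into *diff. */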
static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[2];

  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 32);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1];

  return sse;
}

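/* 64-wide blocks are handled the same way, as four 16-pixel columns. */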
static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
  uint32_t loop_cnt, sse = 0;
  int32_t diff0[4];

  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
                                         sec_pred, filter_horiz, filter_vert,
                                         height, &diff0[loop_cnt], 64);
    src += 16;
    dst += 16;
    sec_pred += 16;
  }

  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];

  return sse;
}

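/* variance = sse - (sum * sum) / (width * height), with the shift equal
 * to log2(width * height).  From 16x32 upward the square is computed in
 * 64 bits, since sum * sum can exceed the 32-bit range there. */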
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);

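/* Emits vpx_sub_pixel_variance<wd>x<ht>_msa: the x/y offsets select the
 * horizontal-only, vertical-only or two-dimensional filtering path, and
 * the (0, 0) case falls back to the plain variance kernel. */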
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                              \
  uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(                           \
      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
      uint32_t *sse) {                                                        \
    int32_t diff;                                                             \
    uint32_t var;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_hv_msa(                         \
            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
      } else {                                                                \
        *sse = sub_pixel_sse_diff_##wd##width_v_msa(                          \
            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
      }                                                                       \
                                                                              \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_sse_diff_##wd##width_h_msa(                          \
            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
                                                                              \
        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
      } else {                                                                \
        var = vpx_variance##wd##x##ht##_msa(src, src_stride, ref, ref_stride, \
                                            sse);                             \
      }                                                                       \
    }                                                                         \
                                                                              \
    return var;                                                               \
  }

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);

VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);

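/* Compound-prediction counterpart of the macro above: identical offset
 * dispatch, but the filtered block is averaged with sec_pred before the
 * variance is taken. */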
#define VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
  uint32_t vpx_sub_pixel_avg_variance##wd##x##ht##_msa(                       \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(                     \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(                      \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride, ref_ptr,     \
                                            ref_stride, sec_pred, ht, &diff); \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                              \
  }

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);

VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
VPX_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);

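/* 32x64 is spelled out rather than generated by the macro above,
 * presumably because its zero-offset helper, avg_sse_diff_32x64_msa,
 * encodes the height in its name instead of taking a height argument. */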
uint32_t vpx_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
                                             int32_t src_stride,
                                             int32_t x_offset, int32_t y_offset,
                                             const uint8_t *ref_ptr,
                                             int32_t ref_stride, uint32_t *sse,
                                             const uint8_t *sec_pred) {
  int32_t diff;
  const uint8_t *h_filter = bilinear_filters_msa[x_offset];
  const uint8_t *v_filter = bilinear_filters_msa[y_offset];

  if (y_offset) {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(
          src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,
          v_filter, 64, &diff);
    } else {
      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  v_filter, 64, &diff);
    }
  } else {
    if (x_offset) {
      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride, ref_ptr,
                                                  ref_stride, sec_pred,
                                                  h_filter, 64, &diff);
    } else {
      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
                                    sec_pred, &diff);
    }
  }

  return VARIANCE_32Wx64H(*sse, diff);
}

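/* The 64-wide sizes need the same treatment: their zero-offset path uses
 * the dedicated avg_sse_diff_64x<ht>_msa helpers, so a separate macro
 * generates them. */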
#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                           \
  uint32_t vpx_sub_pixel_avg_variance64x##ht##_msa(                           \
      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
      uint32_t *sse, const uint8_t *sec_pred) {                               \
    int32_t diff;                                                             \
    const uint8_t *h_filter = bilinear_filters_msa[x_offset];                 \
    const uint8_t *v_filter = bilinear_filters_msa[y_offset];                 \
                                                                              \
    if (y_offset) {                                                           \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_hv_msa(                         \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
            v_filter, ht, &diff);                                             \
      } else {                                                                \
        *sse = sub_pixel_avg_sse_diff_64width_v_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
            &diff);                                                           \
      }                                                                       \
    } else {                                                                  \
      if (x_offset) {                                                         \
        *sse = sub_pixel_avg_sse_diff_64width_h_msa(                          \
            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
            &diff);                                                           \
      } else {                                                                \
        *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride, ref_ptr,       \
                                          ref_stride, sec_pred, &diff);       \
      }                                                                       \
    }                                                                         \
                                                                              \
    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
  }

VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);