1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/macros_msa.h"
13
/* Insert the least-significant 32-bit word of each of in0..in3 into word
 * lanes 0..3 of 'out', packing four 4-byte rows into a single 16-byte
 * vector so a whole 4-wide block of 4 rows fits one vector operation. */
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
/* v16u8-typed convenience wrapper. */
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
22
/* Sum of absolute differences for a 4-pixel-wide block.
 * Four rows are processed per iteration: the 32-bit rows of src and ref
 * are packed into one 16-byte vector each, absolute-differenced per byte,
 * and accumulated into eight halfword lanes.  height is assumed to be a
 * multiple of 4.  Returns the scalar SAD over the whole block. */
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Load four 4-byte rows from each buffer as 32-bit scalars. */
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    /* Pack the four words into word lanes 0..3 of one vector each. */
    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);   /* per-byte |src - ref| */
    sad += __msa_hadd_u_h(diff, diff); /* pairwise widen bytes -> halfwords */
  }

  /* Reduce the eight halfword partial sums to one 32-bit total. */
  return HADD_UH_U32(sad);
}
48
sad_8width_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height)49 static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
50 const uint8_t *ref, int32_t ref_stride,
51 int32_t height) {
52 int32_t ht_cnt;
53 v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
54 v8u16 sad = { 0 };
55
56 for (ht_cnt = (height >> 2); ht_cnt--;) {
57 LD_UB4(src, src_stride, src0, src1, src2, src3);
58 src += (4 * src_stride);
59 LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
60 ref += (4 * ref_stride);
61
62 PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
63 ref0, ref1);
64 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
65 }
66
67 return HADD_UH_U32(sad);
68 }
69
sad_16width_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height)70 static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
71 const uint8_t *ref, int32_t ref_stride,
72 int32_t height) {
73 int32_t ht_cnt;
74 v16u8 src0, src1, ref0, ref1;
75 v8u16 sad = { 0 };
76
77 for (ht_cnt = (height >> 2); ht_cnt--;) {
78 LD_UB2(src, src_stride, src0, src1);
79 src += (2 * src_stride);
80 LD_UB2(ref, ref_stride, ref0, ref1);
81 ref += (2 * ref_stride);
82 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
83
84 LD_UB2(src, src_stride, src0, src1);
85 src += (2 * src_stride);
86 LD_UB2(ref, ref_stride, ref0, ref1);
87 ref += (2 * ref_stride);
88 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
89 }
90
91 return HADD_UH_U32(sad);
92 }
93
sad_32width_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height)94 static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
95 const uint8_t *ref, int32_t ref_stride,
96 int32_t height) {
97 int32_t ht_cnt;
98 v16u8 src0, src1, ref0, ref1;
99 v8u16 sad = { 0 };
100
101 for (ht_cnt = (height >> 2); ht_cnt--;) {
102 LD_UB2(src, 16, src0, src1);
103 src += src_stride;
104 LD_UB2(ref, 16, ref0, ref1);
105 ref += ref_stride;
106 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
107
108 LD_UB2(src, 16, src0, src1);
109 src += src_stride;
110 LD_UB2(ref, 16, ref0, ref1);
111 ref += ref_stride;
112 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
113
114 LD_UB2(src, 16, src0, src1);
115 src += src_stride;
116 LD_UB2(ref, 16, ref0, ref1);
117 ref += ref_stride;
118 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
119
120 LD_UB2(src, 16, src0, src1);
121 src += src_stride;
122 LD_UB2(ref, 16, ref0, ref1);
123 ref += ref_stride;
124 sad += SAD_UB2_UH(src0, src1, ref0, ref1);
125 }
126
127 return HADD_UH_U32(sad);
128 }
129
sad_64width_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height)130 static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
131 const uint8_t *ref, int32_t ref_stride,
132 int32_t height) {
133 int32_t ht_cnt;
134 uint32_t sad = 0;
135 v16u8 src0, src1, src2, src3;
136 v16u8 ref0, ref1, ref2, ref3;
137 v8u16 sad0 = { 0 };
138 v8u16 sad1 = { 0 };
139
140 for (ht_cnt = (height >> 1); ht_cnt--;) {
141 LD_UB4(src, 16, src0, src1, src2, src3);
142 src += src_stride;
143 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
144 ref += ref_stride;
145 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
146 sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
147
148 LD_UB4(src, 16, src0, src1, src2, src3);
149 src += src_stride;
150 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
151 ref += ref_stride;
152 sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
153 sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
154 }
155
156 sad = HADD_UH_U32(sad0);
157 sad += HADD_UH_U32(sad1);
158
159 return sad;
160 }
161
/* SADs for a 4-wide block at three consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..2.  Each ref row is loaded
 * as a full 16-byte vector so the shifted positions can be derived with
 * 1-byte SLDI shifts instead of reloading.  height must be a multiple
 * of 4. */
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    /* Offset 0: pack word lane 0 of each ref row into 'ref'. */
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    /* Offset 1: shift every ref row left by one byte, then repack. */
    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    /* Offset 2: one more cumulative 1-byte shift. */
    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
202
/* SADs for an 8-wide block at three consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..2.  Full 16-byte ref rows
 * are kept (ref00..ref33) so each next offset is a cumulative 1-byte
 * shift; pairs of rows are packed into 16-byte vectors before the SAD.
 * height must be a multiple of 4. */
static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    /* Offset 0: pack row pairs of src and ref into full vectors. */
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    /* Offset 1: shift ref rows by one byte, repack, accumulate. */
    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    /* Offset 2: one more cumulative 1-byte shift. */
    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
237
/* SADs for a 16-wide block at three consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..2.  32 bytes of ref are
 * loaded per row (ref0, ref1) so the 1- and 2-byte shifted windows can be
 * formed with a single SLDI each.  Two rows are processed per iteration;
 * height must be a multiple of 2. */
static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    /* Offset 0: ref0 holds the unshifted window. */
    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    /* Offsets 1 and 2: slide bytes in from ref1. */
    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    /* Second row of the pair: identical processing. */
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
285
/* SADs for a 32-wide block at three consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..2.  48 bytes of ref are
 * loaded per row (ref0_0..ref0_2) so the shifted 32-byte windows come
 * from SLDI pairs.  Two rows are processed per iteration; height must be
 * a multiple of 2. */
static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    /* Offset 0: unshifted window. */
    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    /* Offsets 1 and 2: shifted windows via SLDI. */
    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    /* Second row of the pair: identical processing. */
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}
327
/* SADs for a 64-wide block at three consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..2.  80 bytes of ref are
 * loaded per row (four vectors plus one at ref + 64) so the shifted
 * 64-byte windows come from SLDI.  Two halfword accumulators per offset
 * split the 64-wide load across 16-bit lanes; they are widened to 32-bit
 * words before the final reduction. */
static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    /* Extra vector past the row end feeds the shifted windows. */
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    /* Offset 0. */
    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    /* Offset 1. */
    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    /* Offset 2. */
    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  /* Widen to 32-bit lanes, combine both halves, then reduce to scalar. */
  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}
375
/* SADs for a 4-wide block at eight consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..7.  Each ref row is a full
 * 16-byte vector; each successive offset is produced by one more
 * cumulative 1-byte SLDI shift of all four row vectors, followed by a
 * repack of word lane 0.  height must be a multiple of 4. */
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    /* Offset 0; each following stanza shifts ref rows one more byte. */
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
456
/* SADs for an 8-wide block at eight consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..7.  Full 16-byte ref rows
 * (ref00..ref33) are shifted cumulatively by one byte per offset, then
 * row pairs are packed into 16-byte vectors for the SAD.  height must be
 * a multiple of 4. */
static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    /* Offset 0; each following stanza shifts ref rows one more byte. */
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
526
/* SADs for a 16-wide block at eight consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..7.  32 bytes of ref are
 * loaded per row; each offset window is formed directly with an SLDI of
 * shift k (not cumulatively).  Two rows per iteration; height must be a
 * multiple of 2. */
static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    /* Offset 0 uses ref0 unshifted; offsets 1..7 slide bytes from ref1. */
    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    /* Second row of the pair: identical processing. */
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
625
/* SADs for a 32-wide block at eight consecutive horizontal offsets:
 * sad_array[k] = SAD(src, ref + k) for k = 0..7.  48 bytes of ref are
 * loaded per row; each offset window is formed with an SLDI pair of
 * shift k.  One row per iteration. */
static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    /* Offset 0 uses the unshifted window. */
    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}
680
sad_64width_x8_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height,uint32_t * sad_array)681 static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
682 const uint8_t *ref, int32_t ref_stride,
683 int32_t height, uint32_t *sad_array) {
684 const uint8_t *src_dup, *ref_dup;
685 int32_t ht_cnt;
686 v16u8 src0, src1, src2, src3;
687 v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
688 v16u8 ref0, ref1, ref2, ref3;
689 v8u16 sad0_0 = { 0 };
690 v8u16 sad0_1 = { 0 };
691 v8u16 sad1_0 = { 0 };
692 v8u16 sad1_1 = { 0 };
693 v8u16 sad2_0 = { 0 };
694 v8u16 sad2_1 = { 0 };
695 v8u16 sad3_0 = { 0 };
696 v8u16 sad3_1 = { 0 };
697 v4u32 sad;
698
699 src_dup = src;
700 ref_dup = ref;
701
702 for (ht_cnt = height; ht_cnt--;) {
703 LD_UB4(src, 16, src0, src1, src2, src3);
704 src += src_stride;
705 LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
706 ref += ref_stride;
707
708 sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
709 sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
710
711 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
712 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
713 sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
714 sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
715
716 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
717 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
718 sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
719 sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
720
721 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
722 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
723 sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
724 sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
725 }
726
727 sad = __msa_hadd_u_w(sad0_0, sad0_0);
728 sad += __msa_hadd_u_w(sad0_1, sad0_1);
729 sad_array[0] = HADD_SW_S32(sad);
730
731 sad = __msa_hadd_u_w(sad1_0, sad1_0);
732 sad += __msa_hadd_u_w(sad1_1, sad1_1);
733 sad_array[1] = HADD_SW_S32(sad);
734
735 sad = __msa_hadd_u_w(sad2_0, sad2_0);
736 sad += __msa_hadd_u_w(sad2_1, sad2_1);
737 sad_array[2] = HADD_SW_S32(sad);
738
739 sad = __msa_hadd_u_w(sad3_0, sad3_0);
740 sad += __msa_hadd_u_w(sad3_1, sad3_1);
741 sad_array[3] = HADD_SW_S32(sad);
742
743 sad0_0 = (v8u16)__msa_ldi_h(0);
744 sad0_1 = (v8u16)__msa_ldi_h(0);
745 sad1_0 = (v8u16)__msa_ldi_h(0);
746 sad1_1 = (v8u16)__msa_ldi_h(0);
747 sad2_0 = (v8u16)__msa_ldi_h(0);
748 sad2_1 = (v8u16)__msa_ldi_h(0);
749 sad3_0 = (v8u16)__msa_ldi_h(0);
750 sad3_1 = (v8u16)__msa_ldi_h(0);
751
752 for (ht_cnt = 64; ht_cnt--;) {
753 LD_UB4(src_dup, 16, src0, src1, src2, src3);
754 src_dup += src_stride;
755 LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
756 ref_dup += ref_stride;
757
758 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
759 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
760 sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
761 sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
762
763 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
764 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
765 sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
766 sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
767
768 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
769 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
770 sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
771 sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
772
773 SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
774 SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
775 sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
776 sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
777 }
778
779 sad = __msa_hadd_u_w(sad0_0, sad0_0);
780 sad += __msa_hadd_u_w(sad0_1, sad0_1);
781 sad_array[4] = HADD_SW_S32(sad);
782
783 sad = __msa_hadd_u_w(sad1_0, sad1_0);
784 sad += __msa_hadd_u_w(sad1_1, sad1_1);
785 sad_array[5] = HADD_SW_S32(sad);
786
787 sad = __msa_hadd_u_w(sad2_0, sad2_0);
788 sad += __msa_hadd_u_w(sad2_1, sad2_1);
789 sad_array[6] = HADD_SW_S32(sad);
790
791 sad = __msa_hadd_u_w(sad3_0, sad3_0);
792 sad += __msa_hadd_u_w(sad3_1, sad3_1);
793 sad_array[7] = HADD_SW_S32(sad);
794 }
795
/* SADs of one 4-wide src block against four independent reference blocks:
 * sad_array[k] = SAD(src, aref_ptr[k]) for k = 0..3.  Four rows are
 * packed per vector as in sad_4width_msa; height must be a multiple
 * of 4. */
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Pack four 4-byte src rows into one vector, reused for all refs. */
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
856
/* SADs of one 8-wide src block against four independent reference blocks:
 * sad_array[k] = SAD(src, aref_ptr[k]) for k = 0..3.  Row pairs are
 * packed into 16-byte vectors before the SAD; height must be a multiple
 * of 4. */
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    /* Load four rows of src and of each of the four references. */
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    /* Pack each pair of 8-byte rows into one 16-byte vector. */
    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
907
// SAD of one 16-pixel-wide source block against four candidate reference
// blocks at once.  Each 16-byte row fills exactly one MSA vector, so the loop
// body is a straight load / absolute-difference / horizontal-add sequence,
// manually unrolled to process two rows per iteration (height >> 1).
// Results are written to sad_array[0..3], one per pointer in aref_ptr.
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  // One 8x16-bit accumulator per reference candidate.
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    // First row of the pair.
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    // Second row of the pair (identical sequence, unrolled).
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  // Reduce each 8x16-bit accumulator to a single 32-bit total.
  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
972
// SAD of one 32-pixel-wide source block against four candidate reference
// blocks at once.  Each row is read as two adjacent 16-byte vectors
// (LD_UB2 with a 16-byte offset); one full row is processed per iteration.
// Results are written to sad_array[0..3], one per pointer in aref_ptr.
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  // One 8x16-bit accumulator per reference candidate.
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    // Load the 32-byte source row once, then reuse it for all four refs.
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  // Reduce each 8x16-bit accumulator to a single 32-bit total.
  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}
1016
// SAD of one 64-pixel-wide source block against four candidate reference
// blocks at once.  Each row is read as four adjacent 16-byte vectors; one
// full row is processed per iteration.
// Each reference uses two 8x16-bit accumulators (e.g. sad0_0 for the left
// 32 pixels, sad0_1 for the right 32), which are widened to 32 bits before
// the final reduction — presumably to keep the 16-bit lanes from saturating
// on large blocks; confirm against the worst-case row count if modifying.
// Results are written to sad_array[0..3], one per pointer in aref_ptr.
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    // Load the 64-byte source row once, then reuse it for all four refs.
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  // Widen each pair of 16-bit accumulators to 32 bits, merge, then reduce.
  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}
1081
// SAD of a 4-pixel-wide source block against the rounded average of the
// reference block and a second predictor (compound-prediction SAD).
// sec_pred is a contiguous buffer advanced 16 bytes per 4 rows (a packed
// 4x4 tile).  Rows are loaded as 32-bit words and inserted into vector
// lanes; four rows are processed per iteration (height >> 2).
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    // Gather four 4-byte rows into one 16-byte vector each.
    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    // comp = rounded per-byte average of predictor and reference.
    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}
1110
// SAD of an 8-pixel-wide source block against the rounded average of the
// reference block and a second predictor (compound-prediction SAD).
// sec_pred is a contiguous buffer advanced 32 bytes per 4 rows.  Pairs of
// 8-byte rows are packed into 16-byte vectors before averaging; four rows
// are processed per iteration (height >> 2).
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    // Pack pairs of 8-byte rows into full 16-byte vectors.
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    // diff0/diff1 = rounded per-byte average of predictor and reference.
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}
1134
// SAD of a 16-pixel-wide source block against the rounded average of the
// reference block and a second predictor (compound-prediction SAD).
// sec_pred is a contiguous 16-byte-per-row buffer.  Eight rows are
// processed per iteration (height >> 3, body unrolled 2x over groups of
// four rows), so height is expected to be a multiple of 8.
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    // First group of four rows.
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    // Second group of four rows (identical sequence, unrolled).
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}
1169
// SAD of a 32-pixel-wide source block against the rounded average of the
// reference block and a second predictor (compound-prediction SAD).
// sec_pred is a contiguous 32-byte-per-row buffer.  Each row occupies two
// vectors (left/right 16 bytes); four rows are processed per iteration
// (height >> 2).
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    // Even-numbered vectors hold the left 16 bytes of each row, odd the
    // right 16 bytes.
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}
1205
avgsad_64width_msa(const uint8_t * src,int32_t src_stride,const uint8_t * ref,int32_t ref_stride,int32_t height,const uint8_t * sec_pred)1206 static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
1207 const uint8_t *ref, int32_t ref_stride,
1208 int32_t height, const uint8_t *sec_pred) {
1209 int32_t ht_cnt;
1210 v16u8 src0, src1, src2, src3;
1211 v16u8 ref0, ref1, ref2, ref3;
1212 v16u8 comp0, comp1, comp2, comp3;
1213 v16u8 pred0, pred1, pred2, pred3;
1214 v8u16 sad0 = { 0 };
1215 v8u16 sad1 = { 0 };
1216 v4u32 sad;
1217
1218 for (ht_cnt = (height >> 2); ht_cnt--;) {
1219 LD_UB4(src, 16, src0, src1, src2, src3);
1220 src += src_stride;
1221 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1222 ref += ref_stride;
1223 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1224 sec_pred += 64;
1225 AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
1226 comp1, comp2, comp3);
1227 sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1228 sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1229
1230 LD_UB4(src, 16, src0, src1, src2, src3);
1231 src += src_stride;
1232 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1233 ref += ref_stride;
1234 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1235 sec_pred += 64;
1236 AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
1237 comp1, comp2, comp3);
1238 sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1239 sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1240
1241 LD_UB4(src, 16, src0, src1, src2, src3);
1242 src += src_stride;
1243 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1244 ref += ref_stride;
1245 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1246 sec_pred += 64;
1247 AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
1248 comp1, comp2, comp3);
1249 sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1250 sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1251
1252 LD_UB4(src, 16, src0, src1, src2, src3);
1253 src += src_stride;
1254 LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
1255 ref += ref_stride;
1256 LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
1257 sec_pred += 64;
1258 AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
1259 comp1, comp2, comp3);
1260 sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
1261 sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
1262 }
1263
1264 sad = __msa_hadd_u_w(sad0, sad0);
1265 sad += __msa_hadd_u_w(sad1, sad1);
1266
1267 return HADD_SW_S32(sad);
1268 }
1269
// Wrappers binding the width-specialized SAD kernels above to the fixed-size
// vpx_sadWxH_msa() entry points expected by the vpx_dsp_rtcd dispatch table.
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }
1299
// Wrappers for the x3 variants: SAD of the source against three reference
// positions (ref, ref+1, ref+2), results written to sads[0..2].
#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx3_MSA(height)                                   \
  void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx3_MSA(height)                                   \
  void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }
1334
// Wrappers for the x8 variants: SAD of the source against eight consecutive
// reference positions, results written to sads[0..7].
#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx8_MSA(height)                                   \
  void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx8_MSA(height)                                   \
  void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }
1369
// Wrappers for the x4d variants: SAD of the source against four independent
// reference blocks (refs[0..3]), results written to sads[0..3].
#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }
1404
// Wrappers for the averaged (compound-prediction) SAD variants: SAD of the
// source against avg(ref, second_pred).
#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                      \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src,                   \
                                       int32_t src_stride,                   \
                                       const uint8_t *ref,                   \
                                       int32_t ref_stride,                   \
                                       const uint8_t *second_pred) {         \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,       \
                             second_pred);                                   \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                      \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src,                   \
                                       int32_t src_stride,                   \
                                       const uint8_t *ref,                   \
                                       int32_t ref_stride,                   \
                                       const uint8_t *second_pred) {         \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,       \
                             second_pred);                                   \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                     \
  uint32_t vpx_sad16x##height##_avg_msa(                                     \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,            \
      int32_t ref_stride, const uint8_t *second_pred) {                      \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height,      \
                              second_pred);                                  \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                     \
  uint32_t vpx_sad32x##height##_avg_msa(                                     \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,            \
      int32_t ref_stride, const uint8_t *second_pred) {                      \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height,      \
                              second_pred);                                  \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                     \
  uint32_t vpx_sad64x##height##_avg_msa(                                     \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,            \
      int32_t ref_stride, const uint8_t *second_pred) {                      \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height,      \
                              second_pred);                                  \
  }
1444
// Instantiate every block-size variant (plain SAD, x3, x8, x4d, averaged)
// required by the vpx_dsp_rtcd dispatch table, from 64x64 down to 4x4.
// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);
1535