/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

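/* Insert word 0 of in0..in3 into word lanes 0..3 of 'out'. */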
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

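/* SAD for 4-pixel-wide blocks: four rows are packed into one vector per
   iteration and the absolute differences are accumulated in halfwords. */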
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

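/* Whole-row SAD for 8-, 16-, 32- and 64-pixel-wide blocks, accumulating
   partial sums in unsigned halfword lanes. */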
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

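/* x3 variants: SAD against the reference at byte offsets 0, 1 and 2,
   formed from the loaded reference rows with byte shifts (SLDI). */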
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = height >> 1; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v4u32 sad;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
    ref0_4 = LD_UB(ref + 64);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32((v4i32)sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32((v4i32)sad);
}

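/* x8 variants: SAD against the reference at byte offsets 0 through 7. */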
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1;
  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
    ref += ref_stride;

    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  const uint8_t *src_dup, *ref_dup;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  src_dup = src;
  ref_dup = ref;

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref += ref_stride;

    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_SW_S32(sad);

  sad0_0 = (v8u16)__msa_ldi_h(0);
  sad0_1 = (v8u16)__msa_ldi_h(0);
  sad1_0 = (v8u16)__msa_ldi_h(0);
  sad1_1 = (v8u16)__msa_ldi_h(0);
  sad2_0 = (v8u16)__msa_ldi_h(0);
  sad2_1 = (v8u16)__msa_ldi_h(0);
  sad3_0 = (v8u16)__msa_ldi_h(0);
  sad3_1 = (v8u16)__msa_ldi_h(0);

  for (ht_cnt = 64; ht_cnt--;) {
    LD_UB4(src_dup, 16, src0, src1, src2, src3);
    src_dup += src_stride;
    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
    ref_dup += ref_stride;

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[4] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[5] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[6] = HADD_SW_S32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[7] = HADD_SW_S32(sad);
}

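/* x4d variants: SAD of one source block against four separate reference
   blocks supplied in aref_ptr[]. */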
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

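/* avgsad variants: the reference is averaged with the second predictor
   before the absolute differences are accumulated. */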
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

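/* Wrapper macros that define the exported vpx_sad<W>x<H>*_msa entry points;
   they are instantiated for each supported block size at the end of the
   file. */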
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx3_MSA(height)                                   \
  void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx3_MSA(height)                                   \
  void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx8_MSA(height)                                   \
  void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx8_MSA(height)                                   \
  void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx3_MSA(64);
VPX_SAD_64xHEIGHTx8_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx3_MSA(32);
VPX_SAD_64xHEIGHTx8_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx3_MSA(64);
VPX_SAD_32xHEIGHTx8_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx3_MSA(32);
VPX_SAD_32xHEIGHTx8_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx3_MSA(16);
VPX_SAD_32xHEIGHTx8_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx3_MSA(32);
VPX_SAD_16xHEIGHTx8_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx3_MSA(4);
VPX_SAD_8xHEIGHTx8_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx3_MSA(8);
VPX_SAD_4xHEIGHTx8_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);