/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

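/* Build a 16-byte vector out of four row vectors: word 0 of in0..in3 is
 * copied into word lanes 0..3 of out. */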
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

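/* SAD of a 4-pixel-wide block; four rows are packed into one vector per
 * iteration, so the height is processed in groups of 4. */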
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

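/* SAD of an 8-pixel-wide block; pairs of 8-byte rows are packed into
 * 16-byte vectors so each SAD_UB2_UH call covers two rows. */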
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

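/* SAD of a 16-pixel-wide block, four rows per loop iteration. */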
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

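/* SAD of a 32-pixel-wide block; each row takes two 16-byte loads. */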
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

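/* SAD of a 64-pixel-wide block; two 16-bit accumulators are used so the
 * per-lane sums stay below 16-bit overflow for 64x64 blocks. */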
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

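/* 4-wide SAD at three horizontal offsets (0, +1, +2 pixels) of the same
 * reference rows, for the x3 interface. */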
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

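/* 8-wide SAD at horizontal offsets 0, +1 and +2 of the reference rows. */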
static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

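/* 16-wide SAD at horizontal offsets 0, +1 and +2; the shifted reference
 * vectors are formed with sldi from two adjacent 16-byte loads. */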
static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

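/* 4-wide SAD at eight consecutive horizontal offsets (0 through +7). */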
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

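/* 8-wide SAD at eight consecutive horizontal offsets (0 through +7). */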
static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

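/* 16-wide SAD at eight consecutive horizontal offsets (0 through +7). */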
static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

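/* 4-wide SAD against four independent reference blocks (x4d interface). */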
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

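/* 8-wide SAD against four independent reference blocks. */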
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

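/* 16-wide SAD against four independent reference blocks. */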
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

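/* 32-wide SAD against four independent reference blocks. */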
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

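/* 64-wide SAD against four independent reference blocks; each reference
 * uses a pair of 16-bit accumulators to avoid overflow. */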
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

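/* SAD of a 4-wide block against the average of the reference and a second
 * predictor (compound prediction). */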
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

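/* SAD of an 8-wide block against the average of the reference and a second
 * predictor. */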
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

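/* SAD of a 16-wide block against the average of the reference and a second
 * predictor; processes eight rows per loop iteration. */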
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

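/* SAD of a 32-wide block against the average of the reference and a second
 * predictor. */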
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

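/* SAD of a 64-wide block against the average of the reference and a second
 * predictor. */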
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

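/* The macros below expand into the vpx_sadWxH_msa, vpx_sadWxHxK_msa and
 * vpx_sadWxH_avg_msa entry points whose prototypes come from
 * vpx_dsp_rtcd.h. */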
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                   const uint8_t *const refs[],            \
                                   int32_t ref_stride, uint32_t *sads) {   \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);