1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "me_cmp_mips.h"
23 
sad_8width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)24 static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
25                                uint8_t *ref, int32_t ref_stride,
26                                int32_t height)
27 {
28     int32_t ht_cnt;
29     v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
30     v8u16 sad = { 0 };
31 
32     for (ht_cnt = (height >> 2); ht_cnt--;) {
33         LD_UB4(src, src_stride, src0, src1, src2, src3);
34         src += (4 * src_stride);
35         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
36         ref += (4 * ref_stride);
37 
38         PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
39                     src0, src1, ref0, ref1);
40         sad += SAD_UB2_UH(src0, src1, ref0, ref1);
41     }
42 
43     return (HADD_UH_U32(sad));
44 }
45 
sad_16width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)46 static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
47                                 uint8_t *ref, int32_t ref_stride,
48                                 int32_t height)
49 {
50     int32_t ht_cnt;
51     v16u8 src0, src1, ref0, ref1;
52     v8u16 sad = { 0 };
53 
54     for (ht_cnt = (height >> 2); ht_cnt--;) {
55         LD_UB2(src, src_stride, src0, src1);
56         src += (2 * src_stride);
57         LD_UB2(ref, ref_stride, ref0, ref1);
58         ref += (2 * ref_stride);
59         sad += SAD_UB2_UH(src0, src1, ref0, ref1);
60 
61         LD_UB2(src, src_stride, src0, src1);
62         src += (2 * src_stride);
63         LD_UB2(ref, ref_stride, ref0, ref1);
64         ref += (2 * ref_stride);
65         sad += SAD_UB2_UH(src0, src1, ref0, ref1);
66     }
67 
68     return (HADD_UH_U32(sad));
69 }
70 
sad_horiz_bilinear_filter_8width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)71 static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
72                                                      int32_t src_stride,
73                                                      uint8_t *ref,
74                                                      int32_t ref_stride,
75                                                      int32_t height)
76 {
77     int32_t ht_cnt;
78     v16u8 src0, src1, src2, src3, comp0, comp1;
79     v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
80     v8u16 sad = { 0 };
81 
82     for (ht_cnt = (height >> 3); ht_cnt--;) {
83         LD_UB4(src, src_stride, src0, src1, src2, src3);
84         src += (4 * src_stride);
85         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
86         ref += (4 * ref_stride);
87 
88         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
89         PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
90         SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
91                    ref0, ref1, ref2, ref3);
92         PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
93         AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
94         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
95 
96         LD_UB4(src, src_stride, src0, src1, src2, src3);
97         src += (4 * src_stride);
98         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
99         ref += (4 * ref_stride);
100 
101         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
102         PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref4, ref5);
103         SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
104                    ref0, ref1, ref2, ref3);
105         PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
106         AVER_UB2_UB(ref4, ref0, ref5, ref1, comp0, comp1);
107         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
108     }
109 
110     return (HADD_UH_U32(sad));
111 }
112 
sad_horiz_bilinear_filter_16width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)113 static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
114                                                       int32_t src_stride,
115                                                       uint8_t *ref,
116                                                       int32_t ref_stride,
117                                                       int32_t height)
118 {
119     int32_t ht_cnt;
120     v16u8 src0, src1, src2, src3, comp0, comp1;
121     v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
122     v8u16 sad = { 0 };
123 
124     for (ht_cnt = (height >> 3); ht_cnt--;) {
125         LD_UB4(src, src_stride, src0, src1, src2, src3);
126         src += (4 * src_stride);
127         LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
128         LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
129         ref += (4 * ref_stride);
130 
131         AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
132         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
133         AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
134         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
135 
136         LD_UB4(src, src_stride, src0, src1, src2, src3);
137         src += (4 * src_stride);
138         LD_UB4(ref, ref_stride, ref00, ref10, ref20, ref30);
139         LD_UB4(ref + 1, ref_stride, ref01, ref11, ref21, ref31);
140         ref += (4 * ref_stride);
141 
142         AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
143         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
144         AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
145         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
146     }
147 
148     return (HADD_UH_U32(sad));
149 }
150 
sad_vert_bilinear_filter_8width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)151 static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
152                                                     int32_t src_stride,
153                                                     uint8_t *ref,
154                                                     int32_t ref_stride,
155                                                     int32_t height)
156 {
157     int32_t ht_cnt;
158     v16u8 src0, src1, src2, src3, comp0, comp1;
159     v16u8 ref0, ref1, ref2, ref3, ref4;
160     v8u16 sad = { 0 };
161 
162     for (ht_cnt = (height >> 3); ht_cnt--;) {
163         LD_UB4(src, src_stride, src0, src1, src2, src3);
164         src += (4 * src_stride);
165         LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
166         ref += (4 * ref_stride);
167 
168         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
169         PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
170         PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
171         AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
172         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
173 
174         LD_UB4(src, src_stride, src0, src1, src2, src3);
175         src += (4 * src_stride);
176         LD_UB5(ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
177         ref += (4 * ref_stride);
178 
179         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
180         PCKEV_D2_UB(ref1, ref0, ref2, ref1, ref0, ref1);
181         PCKEV_D2_UB(ref3, ref2, ref4, ref3, ref2, ref3);
182         AVER_UB2_UB(ref1, ref0, ref3, ref2, comp0, comp1);
183         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
184     }
185 
186     return (HADD_UH_U32(sad));
187 }
188 
sad_vert_bilinear_filter_16width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)189 static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
190                                                      int32_t src_stride,
191                                                      uint8_t *ref,
192                                                      int32_t ref_stride,
193                                                      int32_t height)
194 {
195     int32_t ht_cnt;
196     v16u8 src0, src1, src2, src3, comp0, comp1;
197     v16u8 ref0, ref1, ref2, ref3, ref4;
198     v8u16 sad = { 0 };
199 
200     for (ht_cnt = (height >> 3); ht_cnt--;) {
201         LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
202         ref += (5 * ref_stride);
203         LD_UB4(src, src_stride, src0, src1, src2, src3);
204         src += (4 * src_stride);
205 
206         AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
207         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
208         AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
209         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
210 
211         ref4 = ref3;
212 
213         LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
214         ref += (3 * ref_stride);
215         LD_UB4(src, src_stride, src0, src1, src2, src3);
216         src += (4 * src_stride);
217 
218         AVER_UB2_UB(ref0, ref4, ref1, ref0, comp0, comp1);
219         sad += SAD_UB2_UH(src0, src1, comp0, comp1);
220         AVER_UB2_UB(ref2, ref1, ref3, ref2, comp0, comp1);
221         sad += SAD_UB2_UH(src2, src3, comp0, comp1);
222     }
223 
224     return (HADD_UH_U32(sad));
225 }
226 
sad_hv_bilinear_filter_8width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)227 static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
228                                                   int32_t src_stride,
229                                                   uint8_t *ref,
230                                                   int32_t ref_stride,
231                                                   int32_t height)
232 {
233     int32_t ht_cnt;
234     v16u8 src0, src1, src2, src3, temp0, temp1, diff;
235     v16u8 ref0, ref1, ref2, ref3, ref4;
236     v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
237     v8u16 comp0, comp1, comp2, comp3;
238     v8u16 sad = { 0 };
239 
240     for (ht_cnt = (height >> 2); ht_cnt--;) {
241         LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
242         ref += (4 * ref_stride);
243         LD_UB4(src, src_stride, src0, src1, src2, src3);
244         src += (4 * src_stride);
245 
246         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
247 
248         VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
249         comp0 = __msa_hadd_u_h(temp0, temp0);
250         comp1 = __msa_hadd_u_h(temp1, temp1);
251         comp0 += comp1;
252         comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
253         comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
254 
255         temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
256         comp2 = __msa_hadd_u_h(temp0, temp0);
257         comp1 += comp2;
258         comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
259         comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
260         comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
261         diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
262         sad += __msa_hadd_u_h(diff, diff);
263 
264         temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
265         comp3 = __msa_hadd_u_h(temp1, temp1);
266         comp2 += comp3;
267         comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
268         comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
269 
270         temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
271         comp0 = __msa_hadd_u_h(temp0, temp0);
272         comp3 += comp0;
273         comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
274         comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
275         comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
276         diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
277         sad += __msa_hadd_u_h(diff, diff);
278     }
279 
280     return (HADD_UH_U32(sad));
281 }
282 
sad_hv_bilinear_filter_16width_msa(uint8_t * src,int32_t src_stride,uint8_t * ref,int32_t ref_stride,int32_t height)283 static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
284                                                    int32_t src_stride,
285                                                    uint8_t *ref,
286                                                    int32_t ref_stride,
287                                                    int32_t height)
288 {
289     int32_t ht_cnt;
290     v16u8 src0, src1, src2, src3, comp, diff;
291     v16u8 temp0, temp1, temp2, temp3;
292     v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
293     v8u16 comp0, comp1, comp2, comp3;
294     v8u16 sad = { 0 };
295 
296     for (ht_cnt = (height >> 3); ht_cnt--;) {
297         LD_UB4(src, src_stride, src0, src1, src2, src3);
298         src += (4 * src_stride);
299         LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
300         LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
301         ref += (5 * ref_stride);
302 
303         ILVRL_B2_UB(ref14, ref04, temp0, temp1);
304         comp0 = __msa_hadd_u_h(temp0, temp0);
305         comp1 = __msa_hadd_u_h(temp1, temp1);
306         ILVRL_B2_UB(ref10, ref00, temp2, temp3);
307         comp2 = __msa_hadd_u_h(temp2, temp2);
308         comp3 = __msa_hadd_u_h(temp3, temp3);
309         comp0 += comp2;
310         comp1 += comp3;
311         SRARI_H2_UH(comp0, comp1, 2);
312         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
313         diff = __msa_asub_u_b(src0, comp);
314         sad += __msa_hadd_u_h(diff, diff);
315 
316         ILVRL_B2_UB(ref11, ref01, temp0, temp1);
317         comp0 = __msa_hadd_u_h(temp0, temp0);
318         comp1 = __msa_hadd_u_h(temp1, temp1);
319         comp2 += comp0;
320         comp3 += comp1;
321         SRARI_H2_UH(comp2, comp3, 2);
322         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
323         diff = __msa_asub_u_b(src1, comp);
324         sad += __msa_hadd_u_h(diff, diff);
325 
326         ILVRL_B2_UB(ref12, ref02, temp2, temp3);
327         comp2 = __msa_hadd_u_h(temp2, temp2);
328         comp3 = __msa_hadd_u_h(temp3, temp3);
329         comp0 += comp2;
330         comp1 += comp3;
331         SRARI_H2_UH(comp0, comp1, 2);
332         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
333         diff = __msa_asub_u_b(src2, comp);
334         sad += __msa_hadd_u_h(diff, diff);
335 
336         ILVRL_B2_UB(ref13, ref03, temp0, temp1);
337         comp0 = __msa_hadd_u_h(temp0, temp0);
338         comp1 = __msa_hadd_u_h(temp1, temp1);
339         comp2 += comp0;
340         comp3 += comp1;
341         SRARI_H2_UH(comp2, comp3, 2);
342         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
343         diff = __msa_asub_u_b(src3, comp);
344         sad += __msa_hadd_u_h(diff, diff);
345 
346         LD_UB4(src, src_stride, src0, src1, src2, src3);
347         src += (4 * src_stride);
348         LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
349         LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
350         ref += (3 * ref_stride);
351 
352         ILVRL_B2_UB(ref10, ref00, temp2, temp3);
353         comp2 = __msa_hadd_u_h(temp2, temp2);
354         comp3 = __msa_hadd_u_h(temp3, temp3);
355         comp0 += comp2;
356         comp1 += comp3;
357         SRARI_H2_UH(comp0, comp1, 2);
358         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
359         diff = __msa_asub_u_b(src0, comp);
360         sad += __msa_hadd_u_h(diff, diff);
361 
362         ILVRL_B2_UB(ref11, ref01, temp0, temp1);
363         comp0 = __msa_hadd_u_h(temp0, temp0);
364         comp1 = __msa_hadd_u_h(temp1, temp1);
365         comp2 += comp0;
366         comp3 += comp1;
367         SRARI_H2_UH(comp2, comp3, 2);
368         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
369         diff = __msa_asub_u_b(src1, comp);
370         sad += __msa_hadd_u_h(diff, diff);
371 
372         ILVRL_B2_UB(ref12, ref02, temp2, temp3);
373         comp2 = __msa_hadd_u_h(temp2, temp2);
374         comp3 = __msa_hadd_u_h(temp3, temp3);
375         comp0 += comp2;
376         comp1 += comp3;
377         SRARI_H2_UH(comp0, comp1, 2);
378         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
379         diff = __msa_asub_u_b(src2, comp);
380         sad += __msa_hadd_u_h(diff, diff);
381 
382         ILVRL_B2_UB(ref13, ref03, temp0, temp1);
383         comp0 = __msa_hadd_u_h(temp0, temp0);
384         comp1 = __msa_hadd_u_h(temp1, temp1);
385         comp2 += comp0;
386         comp3 += comp1;
387         SRARI_H2_UH(comp2, comp3, 2);
388         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
389         diff = __msa_asub_u_b(src3, comp);
390         sad += __msa_hadd_u_h(diff, diff);
391     }
392 
393     return (HADD_UH_U32(sad));
394 }
395 
/*
 * Accumulate the squared differences between two byte vectors.
 *
 * src and ref are interleaved, the interleaved pairs are reduced with
 * HSUB_UB2_SH() (presumably to signed 16-bit differences), and
 * DPADD_SH2_SW() multiplies each difference vector by itself while adding
 * the products onto var.
 *
 * src, ref  byte vectors to compare
 * var       accumulator, updated in place
 *
 * The body is wrapped in do { } while (0) so that the usual
 * "CALC_MSE_B(a, b, v);" form is exactly one statement and stays valid as
 * the unbraced body of an if/else.
 */
#define CALC_MSE_B(src, ref, var)                                    \
do {                                                                 \
    v16u8 src_l0_m, src_l1_m;                                        \
    v8i16 res_l0_m, res_l1_m;                                        \
                                                                     \
    ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
    HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
    DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
} while (0)
405 
sse_4width_msa(uint8_t * src_ptr,int32_t src_stride,uint8_t * ref_ptr,int32_t ref_stride,int32_t height)406 static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
407                                uint8_t *ref_ptr, int32_t ref_stride,
408                                int32_t height)
409 {
410     int32_t ht_cnt;
411     uint32_t sse;
412     uint32_t src0, src1, src2, src3;
413     uint32_t ref0, ref1, ref2, ref3;
414     v16u8 src = { 0 };
415     v16u8 ref = { 0 };
416     v4i32 var = { 0 };
417 
418     for (ht_cnt = (height >> 2); ht_cnt--;) {
419         LW4(src_ptr, src_stride, src0, src1, src2, src3);
420         src_ptr += (4 * src_stride);
421         LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
422         ref_ptr += (4 * ref_stride);
423 
424         INSERT_W4_UB(src0, src1, src2, src3, src);
425         INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
426         CALC_MSE_B(src, ref, var);
427     }
428 
429     sse = HADD_SW_S32(var);
430 
431     return sse;
432 }
433 
sse_8width_msa(uint8_t * src_ptr,int32_t src_stride,uint8_t * ref_ptr,int32_t ref_stride,int32_t height)434 static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
435                                uint8_t *ref_ptr, int32_t ref_stride,
436                                int32_t height)
437 {
438     int32_t ht_cnt;
439     uint32_t sse;
440     v16u8 src0, src1, src2, src3;
441     v16u8 ref0, ref1, ref2, ref3;
442     v4i32 var = { 0 };
443 
444     for (ht_cnt = (height >> 2); ht_cnt--;) {
445         LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
446         src_ptr += (4 * src_stride);
447         LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
448         ref_ptr += (4 * ref_stride);
449 
450         PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
451                     src0, src1, ref0, ref1);
452         CALC_MSE_B(src0, ref0, var);
453         CALC_MSE_B(src1, ref1, var);
454     }
455 
456     sse = HADD_SW_S32(var);
457 
458     return sse;
459 }
460 
sse_16width_msa(uint8_t * src_ptr,int32_t src_stride,uint8_t * ref_ptr,int32_t ref_stride,int32_t height)461 static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
462                                 uint8_t *ref_ptr, int32_t ref_stride,
463                                 int32_t height)
464 {
465     int32_t ht_cnt;
466     uint32_t sse;
467     v16u8 src, ref;
468     v4i32 var = { 0 };
469 
470     for (ht_cnt = (height >> 2); ht_cnt--;) {
471         src = LD_UB(src_ptr);
472         src_ptr += src_stride;
473         ref = LD_UB(ref_ptr);
474         ref_ptr += ref_stride;
475         CALC_MSE_B(src, ref, var);
476 
477         src = LD_UB(src_ptr);
478         src_ptr += src_stride;
479         ref = LD_UB(ref_ptr);
480         ref_ptr += ref_stride;
481         CALC_MSE_B(src, ref, var);
482 
483         src = LD_UB(src_ptr);
484         src_ptr += src_stride;
485         ref = LD_UB(ref_ptr);
486         ref_ptr += ref_stride;
487         CALC_MSE_B(src, ref, var);
488 
489         src = LD_UB(src_ptr);
490         src_ptr += src_stride;
491         ref = LD_UB(ref_ptr);
492         ref_ptr += ref_stride;
493         CALC_MSE_B(src, ref, var);
494     }
495 
496     sse = HADD_SW_S32(var);
497 
498     return sse;
499 }
500 
/**
 * Score an 8x8 block by transforming the difference between src and ref
 * with butterfly passes along both directions and summing the absolute
 * values of the result (a Hadamard-type transform, per the function name).
 *
 * @param src         first of the eight source rows
 * @param src_stride  byte offset between consecutive source rows
 * @param ref         first of the eight reference rows
 * @param ref_stride  byte offset between consecutive reference rows
 * @return the value produced by HADD_UH_U32() on the vector of partial sums
 */
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *ref, int32_t ref_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v8i16 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
    /* Interleave each src/ref row pair, then reduce every interleaved pair
     * with HSUB - presumably yielding one 16-bit difference per pixel. */
    ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
               src4, ref4, src5, ref5, src6, ref6, src7, ref7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
    HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
    /* First direction: transpose, then three butterfly stages. */
    TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                       diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    /* Second direction: transpose back and run two butterfly stages; the
     * third stage is folded into the absolute-value sums below. */
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    /* diffN = tempN + temp(N+4): the "sum" outputs of the last stage. */
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    /* asub_s_h gives |tempN - temp(N+4)|, the "difference" outputs. */
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    /* add_a_h adds the absolute values of both operands; with a zero second
     * operand it is used here as a plain per-lane absolute value. */
    sum += __msa_add_a_h((v8i16) diff0, zero);
    sum += __msa_add_a_h((v8i16) diff1, zero);
    sum += __msa_add_a_h((v8i16) diff2, zero);
    sum += __msa_add_a_h((v8i16) diff3, zero);

    return (HADD_UH_U32(sum));
}
545 
/**
 * Score a single 8x8 block by running butterfly passes along both
 * directions on the pixel values themselves and summing the absolute values
 * of the result, excluding the contribution of lane 0 of the first output
 * row (a Hadamard-type transform, per the function name).
 *
 * @param src         first of the eight rows of the block
 * @param src_stride  byte offset between consecutive rows
 * @param ref         not used by this function
 * @param ref_stride  not used by this function
 * @return the summed absolute values minus |temp0[0] + temp4[0]|
 */
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
{
    int32_t sum_res = 0;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v8i16 sum = { 0 };
    v16i8 zero = { 0 };

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    /* Transpose at byte level, then interleave with zero - presumably a
     * zero-extension of the pixels to 16-bit lanes. */
    TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                       src0, src1, src2, src3, src4, src5, src6, src7);
    ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
               zero, src4, zero, src5, zero, src6, zero, src7,
               diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
    /* First direction: three butterfly stages. */
    BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
    BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
    BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
    /* Second direction: transpose and run two butterfly stages; the third
     * stage is folded into the absolute-value sums below. */
    TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                       temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
    BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
    BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
    /* diffN = tempN + temp(N+4): the "sum" outputs of the last stage. */
    ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
         diff0, diff1, diff2, diff3);
    /* asub_s_h gives |tempN - temp(N+4)|, the "difference" outputs. */
    sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
    sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
    sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
    sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
    /* add_a_h adds the absolute values of both operands; with a zero second
     * operand it is used here as a plain per-lane absolute value. */
    sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
    sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
    sum_res = (HADD_UH_U32(sum));
    /* Remove the term that was added above as lane 0 of |diff0|
     * (diff0 = temp0 + temp4) - presumably the DC coefficient. */
    sum_res -= abs(temp0[0] + temp4[0]);

    return sum_res;
}
589 
/**
 * Entry point forwarding to sad_16width_msa(); the same stride is used for
 * both buffers and the context argument v is not used.
 */
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
{
    const uint32_t sad = sad_16width_msa(src, stride, ref, stride, height);

    return sad;
}
595 
/**
 * Entry point forwarding to sad_8width_msa(); the same stride is used for
 * both buffers and the context argument v is not used.
 */
int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                    ptrdiff_t stride, int height)
{
    const uint32_t sad = sad_8width_msa(src, stride, ref, stride, height);

    return sad;
}
601 
/**
 * Entry point forwarding to sad_horiz_bilinear_filter_16width_msa() with
 * pix1 as the source and pix2 as the filtered reference; the same stride is
 * used for both buffers and the context argument v is not used.
 */
int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
607 
/**
 * Entry point forwarding to sad_vert_bilinear_filter_16width_msa() with
 * pix1 as the source and pix2 as the filtered reference; the same stride is
 * used for both buffers and the context argument v is not used.
 */
int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
613 
/**
 * Entry point forwarding to sad_hv_bilinear_filter_16width_msa() with pix1
 * as the source and pix2 as the filtered reference; the same stride is used
 * for both buffers and the context argument v is not used.
 */
int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
619 
/**
 * Entry point forwarding to sad_horiz_bilinear_filter_8width_msa() with
 * pix1 as the source and pix2 as the filtered reference; the same stride is
 * used for both buffers and the context argument v is not used.
 */
int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
625 
/**
 * Entry point forwarding to sad_vert_bilinear_filter_8width_msa() with pix1
 * as the source and pix2 as the filtered reference; the same stride is used
 * for both buffers and the context argument v is not used.
 */
int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
631 
/**
 * Entry point forwarding to sad_hv_bilinear_filter_8width_msa() with pix1
 * as the source and pix2 as the filtered reference; the same stride is used
 * for both buffers and the context argument v is not used.
 */
int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
{
    const uint32_t sad =
        sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

    return sad;
}
637 
/**
 * Entry point forwarding to sse_16width_msa(); the same stride is used for
 * both buffers and the context argument v is not used.
 */
int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
{
    const uint32_t sse = sse_16width_msa(src, stride, ref, stride, height);

    return sse;
}
643 
/**
 * Entry point forwarding to sse_8width_msa(); the same stride is used for
 * both buffers and the context argument v is not used.
 */
int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    const uint32_t sse = sse_8width_msa(src, stride, ref, stride, height);

    return sse;
}
649 
/**
 * Entry point forwarding to sse_4width_msa(); the same stride is used for
 * both buffers and the context argument v is not used.
 */
int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                ptrdiff_t stride, int height)
{
    const uint32_t sse = sse_4width_msa(src, stride, ref, stride, height);

    return sse;
}
655 
/**
 * Entry point forwarding to hadamard_diff_8x8_msa(). The two block pointers
 * are handed over in swapped order (src first, dst second) and share one
 * stride; the arguments s and h are not used.
 */
int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                             ptrdiff_t stride, int h)
{
    const int32_t score = hadamard_diff_8x8_msa(src, stride, dst, stride);

    return score;
}
661 
/**
 * Entry point for the 8x8 intra Hadamard score.
 *
 * The intra metric looks at a single block. In the me_cmp calling convention
 * that block is the first block pointer (named dst here), while the second
 * one (src) is a placeholder that callers may leave NULL.
 * hadamard_intra_8x8_msa() reads pixels only through its first pointer
 * argument, so dst has to be passed there; handing it src would make the
 * kernel read the placeholder instead of the block being scored.
 *
 * @param s       not used
 * @param dst     first row of the block to score
 * @param src     placeholder, forwarded only to the kernel's unused
 *                reference argument
 * @param stride  byte offset between consecutive rows
 * @param h       not used
 * @return the score computed by hadamard_intra_8x8_msa()
 */
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
{
    return hadamard_intra_8x8_msa(dst, stride, src, stride);
}
667 
/*
 * Hadamard transform wrappers.
 *
 * WRAPPER8_16_SQ(name8, name16) defines name16 as a 16-pixel-wide scorer
 * built from the 8x8 scorer name8: it adds up name8 for the left and right
 * 8x8 blocks of the top eight rows and, when h equals 16, also for the two
 * blocks eight rows further down.
 */
#define WRAPPER8_16_SQ(name8, name16)                      \
int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
           ptrdiff_t stride, int h)                        \
{                                                          \
    int total = name8(s, dst, src, stride, 8);             \
                                                           \
    total += name8(s, dst + 8, src + 8, stride, 8);        \
    if (h != 16)                                           \
        return total;                                      \
                                                           \
    dst += 8 * stride;                                     \
    src += 8 * stride;                                     \
    total += name8(s, dst, src, stride, 8);                \
    total += name8(s, dst + 8, src + 8, stride, 8);        \
    return total;                                          \
}
684 
685 WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
686 WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);
687