/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"

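/* Inserts word 0 of each of in0..in3 into word lanes 0..3 of out. */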
#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out)       \
  {                                                        \
    out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2); \
    out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3); \
  }
#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)

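/* SAD of a 4-pixel-wide block: four rows of 4 bytes from each side are
 * gathered into a single 16-byte vector, so one absolute-difference and
 * horizontal-add pass covers four rows at once. */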
static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    diff = __msa_asub_u_b(src, ref);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

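/* SAD of an 8-pixel-wide block: PCKEV_D4_UB packs pairs of 8-byte rows
 * into full 16-byte vectors before accumulating. */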
static uint32_t sad_8width_msa(const uint8_t *src, int32_t src_stride,
                               const uint8_t *ref, int32_t ref_stride,
                               int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);

    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

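/* SAD of a 16-pixel-wide block: one 16-byte load per row, two rows per
 * SAD_UB2_UH call, four rows per loop iteration. */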
static uint32_t sad_16width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, src_stride, src0, src1);
    src += (2 * src_stride);
    LD_UB2(ref, ref_stride, ref0, ref1);
    ref += (2 * ref_stride);
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

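/* SAD of a 32-pixel-wide block: each row is two 16-byte loads; the loop
 * body is unrolled four rows deep. */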
static uint32_t sad_32width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(src, 16, src0, src1);
    src += src_stride;
    LD_UB2(ref, 16, ref0, ref1);
    ref += ref_stride;
    sad += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  return HADD_UH_U32(sad);
}

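/* SAD of a 64-pixel-wide block: each row is four 16-byte loads. Two
 * separate v8u16 accumulators are used so the 16-bit lanes cannot
 * overflow over a full 64-row block before the final horizontal add. */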
static uint32_t sad_64width_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *ref, int32_t ref_stride,
                                int32_t height) {
  int32_t ht_cnt;
  uint32_t sad = 0;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = HADD_UH_U32(sad0);
  sad += HADD_UH_U32(sad1);

  return sad;
}

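/* x3 variant: SADs against three reference candidates at horizontal
 * offsets 0, 1 and 2 pixels. SLDI_B2_UB shifts the loaded reference rows
 * left by one byte between passes to form the next offset. */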
static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    INSERT_W4_UB(src0, src1, src2, src3, src);

    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

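/* 8-wide x3 variant: same three horizontal offsets, with rows packed in
 * pairs as in sad_8width_msa. */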
static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

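/* 16-wide x3 variant: 32 bytes of reference are loaded per row so that
 * __msa_sldi_b can slice out the +1 and +2 byte offsets. */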
static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref, ref0, ref1, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
}

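/* x8 variant: SADs against eight reference candidates at horizontal
 * offsets 0 through 7 pixels, produced by repeated one-byte shifts of the
 * same loaded reference rows. */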
static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                              const uint8_t *ref_ptr, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, diff;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);
    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);

    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

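/* 8-wide x8 variant: offsets 0..7, rows packed in pairs. */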
static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
                              const uint8_t *ref, int32_t ref_stride,
                              int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
    ref += (4 * ref_stride);
    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
                ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);

    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

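/* 16-wide x8 variant: offsets 1..7 are sliced directly out of the 32-byte
 * reference row with __msa_sldi_b. */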
static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *ref_ptr, int32_t ref_stride,
                               int32_t height, uint32_t *sad_array) {
  int32_t ht_cnt;
  v16u8 src, ref0, ref1, ref;
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };
  v8u16 sad4 = { 0 };
  v8u16 sad5 = { 0 };
  v8u16 sad6 = { 0 };
  v8u16 sad7 = { 0 };

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    LD_UB2(ref_ptr, 16, ref0, ref1);
    ref_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
    diff = __msa_asub_u_b(src, ref);
    sad4 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
    diff = __msa_asub_u_b(src, ref);
    sad5 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
    diff = __msa_asub_u_b(src, ref);
    sad6 += __msa_hadd_u_h(diff, diff);

    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
    diff = __msa_asub_u_b(src, ref);
    sad7 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
  sad_array[4] = HADD_UH_U32(sad4);
  sad_array[5] = HADD_UH_U32(sad5);
  sad_array[6] = HADD_UH_U32(sad6);
  sad_array[7] = HADD_UH_U32(sad7);
}

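/* x4d variant: SADs of the same source block against four independent
 * reference blocks supplied in aref_ptr[0..3]. */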
static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    INSERT_W4_UB(src0, src1, src2, src3, src);
    src_ptr += (4 * src_stride);

    LW4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref0_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad0 += __msa_hadd_u_h(diff, diff);

    LW4(ref1_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref1_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad1 += __msa_hadd_u_h(diff, diff);

    LW4(ref2_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref2_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad2 += __msa_hadd_u_h(diff, diff);

    LW4(ref3_ptr, ref_stride, ref0, ref1, ref2, ref3);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    ref3_ptr += (4 * ref_stride);

    diff = __msa_asub_u_b(src, ref);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

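/* 8-wide x4d variant. */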
static void sad_8width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                               const uint8_t *const aref_ptr[],
                               int32_t ref_stride, int32_t height,
                               uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LD_UB4(ref0_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref0_ptr += (4 * ref_stride);
    LD_UB4(ref1_ptr, ref_stride, ref4, ref5, ref6, ref7);
    ref1_ptr += (4 * ref_stride);
    LD_UB4(ref2_ptr, ref_stride, ref8, ref9, ref10, ref11);
    ref2_ptr += (4 * ref_stride);
    LD_UB4(ref3_ptr, ref_stride, ref12, ref13, ref14, ref15);
    ref3_ptr += (4 * ref_stride);

    PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref5, ref4, ref7, ref6, ref0, ref1);
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref9, ref8, ref11, ref10, ref0, ref1);
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    PCKEV_D2_UB(ref13, ref12, ref15, ref14, ref0, ref1);
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

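/* 16-wide x4d variant, two rows per loop iteration. */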
static void sad_16width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  int32_t ht_cnt;
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  v16u8 src, ref0, ref1, ref2, ref3, diff;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = (height >> 1); ht_cnt--;) {
    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);

    src = LD_UB(src_ptr);
    src_ptr += src_stride;
    ref0 = LD_UB(ref0_ptr);
    ref0_ptr += ref_stride;
    ref1 = LD_UB(ref1_ptr);
    ref1_ptr += ref_stride;
    ref2 = LD_UB(ref2_ptr);
    ref2_ptr += ref_stride;
    ref3 = LD_UB(ref3_ptr);
    ref3_ptr += ref_stride;

    diff = __msa_asub_u_b(src, ref0);
    sad0 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref1);
    sad1 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref2);
    sad2 += __msa_hadd_u_h(diff, diff);
    diff = __msa_asub_u_b(src, ref3);
    sad3 += __msa_hadd_u_h(diff, diff);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

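/* 32-wide x4d variant, one row per loop iteration. */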
static void sad_32width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, ref0, ref1;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v8u16 sad2 = { 0 };
  v8u16 sad3 = { 0 };

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB2(src, 16, src0, src1);
    src += src_stride;

    LD_UB2(ref0_ptr, 16, ref0, ref1);
    ref0_ptr += ref_stride;
    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref1_ptr, 16, ref0, ref1);
    ref1_ptr += ref_stride;
    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref2_ptr, 16, ref0, ref1);
    ref2_ptr += ref_stride;
    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);

    LD_UB2(ref3_ptr, 16, ref0, ref1);
    ref3_ptr += ref_stride;
    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
  }

  sad_array[0] = HADD_UH_U32(sad0);
  sad_array[1] = HADD_UH_U32(sad1);
  sad_array[2] = HADD_UH_U32(sad2);
  sad_array[3] = HADD_UH_U32(sad3);
}

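/* 64-wide x4d variant: two v8u16 accumulators per reference, widened to
 * 32-bit lanes with __msa_hadd_u_w before the final reduction. */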
static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
                                uint32_t *sad_array) {
  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v8u16 sad0_0 = { 0 };
  v8u16 sad0_1 = { 0 };
  v8u16 sad1_0 = { 0 };
  v8u16 sad1_1 = { 0 };
  v8u16 sad2_0 = { 0 };
  v8u16 sad2_1 = { 0 };
  v8u16 sad3_0 = { 0 };
  v8u16 sad3_1 = { 0 };
  v4u32 sad;

  ref0_ptr = aref_ptr[0];
  ref1_ptr = aref_ptr[1];
  ref2_ptr = aref_ptr[2];
  ref3_ptr = aref_ptr[3];

  for (ht_cnt = height; ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;

    LD_UB4(ref0_ptr, 16, ref0, ref1, ref2, ref3);
    ref0_ptr += ref_stride;
    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref1_ptr, 16, ref0, ref1, ref2, ref3);
    ref1_ptr += ref_stride;
    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref2_ptr, 16, ref0, ref1, ref2, ref3);
    ref2_ptr += ref_stride;
    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);

    LD_UB4(ref3_ptr, 16, ref0, ref1, ref2, ref3);
    ref3_ptr += ref_stride;
    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
  }

  sad = __msa_hadd_u_w(sad0_0, sad0_0);
  sad += __msa_hadd_u_w(sad0_1, sad0_1);
  sad_array[0] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad1_0, sad1_0);
  sad += __msa_hadd_u_w(sad1_1, sad1_1);
  sad_array[1] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad2_0, sad2_0);
  sad += __msa_hadd_u_w(sad2_1, sad2_1);
  sad_array[2] = HADD_UW_U32(sad);

  sad = __msa_hadd_u_w(sad3_0, sad3_0);
  sad += __msa_hadd_u_w(sad3_1, sad3_1);
  sad_array[3] = HADD_UW_U32(sad);
}

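/* Averaged SAD: the reference is first combined with the second predictor
 * via a rounding byte average (__msa_aver_u_b), then compared to the
 * source; sec_pred is a contiguous width x height block. */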
static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                  const uint8_t *ref_ptr, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 src = { 0 };
  v16u8 ref = { 0 };
  v16u8 diff, pred, comp;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LW4(src_ptr, src_stride, src0, src1, src2, src3);
    src_ptr += (4 * src_stride);
    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
    ref_ptr += (4 * ref_stride);
    pred = LD_UB(sec_pred);
    sec_pred += 16;

    INSERT_W4_UB(src0, src1, src2, src3, src);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);

    comp = __msa_aver_u_b(pred, ref);
    diff = __msa_asub_u_b(src, comp);
    sad += __msa_hadd_u_h(diff, diff);
  }

  return HADD_UH_U32(sad);
}

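/* 8-wide averaged SAD, rows packed in pairs. */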
static uint32_t avgsad_8width_msa(const uint8_t *src, int32_t src_stride,
                                  const uint8_t *ref, int32_t ref_stride,
                                  int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 diff0, diff1, pred0, pred1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB2(sec_pred, 16, pred0, pred1);
    sec_pred += 32;
    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2, src0, src1,
                ref0, ref1);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, diff0, diff1);
    sad += SAD_UB2_UH(src0, src1, diff0, diff1);
  }

  return HADD_UH_U32(sad);
}

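/* 16-wide averaged SAD, eight rows per loop iteration. */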
static uint32_t avgsad_16width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
  v16u8 pred0, pred1, pred2, pred3, comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 3); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);

    LD_UB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LD_UB4(ref, ref_stride, ref0, ref1, ref2, ref3);
    ref += (4 * ref_stride);
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += (4 * 16);
    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

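/* 32-wide averaged SAD, four rows per loop iteration. */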
static uint32_t avgsad_32width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
  v16u8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
  v16u8 comp0, comp1;
  v8u16 sad = { 0 };

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);

    LD_UB4(ref, ref_stride, ref0, ref2, ref4, ref6);
    LD_UB4(ref + 16, ref_stride, ref1, ref3, ref5, ref7);
    ref += (4 * ref_stride);

    LD_UB4(sec_pred, 32, pred0, pred2, pred4, pred6);
    LD_UB4(sec_pred + 16, 32, pred1, pred3, pred5, pred7);
    sec_pred += (4 * 32);

    AVER_UB2_UB(pred0, ref0, pred1, ref1, comp0, comp1);
    sad += SAD_UB2_UH(src0, src1, comp0, comp1);
    AVER_UB2_UB(pred2, ref2, pred3, ref3, comp0, comp1);
    sad += SAD_UB2_UH(src2, src3, comp0, comp1);
    AVER_UB2_UB(pred4, ref4, pred5, ref5, comp0, comp1);
    sad += SAD_UB2_UH(src4, src5, comp0, comp1);
    AVER_UB2_UB(pred6, ref6, pred7, ref7, comp0, comp1);
    sad += SAD_UB2_UH(src6, src7, comp0, comp1);
  }

  return HADD_UH_U32(sad);
}

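/* 64-wide averaged SAD, unrolled four rows deep, with split accumulators
 * widened to 32 bits before the final reduction. */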
static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
                                   const uint8_t *ref, int32_t ref_stride,
                                   int32_t height, const uint8_t *sec_pred) {
  int32_t ht_cnt;
  v16u8 src0, src1, src2, src3;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 comp0, comp1, comp2, comp3;
  v16u8 pred0, pred1, pred2, pred3;
  v8u16 sad0 = { 0 };
  v8u16 sad1 = { 0 };
  v4u32 sad;

  for (ht_cnt = (height >> 2); ht_cnt--;) {
    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);

    LD_UB4(src, 16, src0, src1, src2, src3);
    src += src_stride;
    LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
    ref += ref_stride;
    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
    sec_pred += 64;
    AVER_UB4_UB(pred0, ref0, pred1, ref1, pred2, ref2, pred3, ref3, comp0,
                comp1, comp2, comp3);
    sad0 += SAD_UB2_UH(src0, src1, comp0, comp1);
    sad1 += SAD_UB2_UH(src2, src3, comp2, comp3);
  }

  sad = __msa_hadd_u_w(sad0, sad0);
  sad += __msa_hadd_u_w(sad1, sad1);

  return HADD_SW_S32(sad);
}

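/* Wrappers binding the width-specialized helpers above to the
 * vpx_sad<W>x<H>[x3|x8|x4d|_avg]_msa entry points dispatched through
 * vpx_dsp_rtcd. */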
#define VPX_SAD_4xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad4x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_4width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_8xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad8x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                   const uint8_t *ref, int32_t ref_stride) { \
    return sad_8width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_16xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad16x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_16width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_32xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad32x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_32width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_64xHEIGHT_MSA(height)                                         \
  uint32_t vpx_sad64x##height##_msa(const uint8_t *src, int32_t src_stride,   \
                                    const uint8_t *ref, int32_t ref_stride) { \
    return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
  }

#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                 const uint8_t *ref, int32_t ref_stride, \
                                 uint32_t *sads) {                       \
    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *ref, int32_t ref_stride, \
                                  uint32_t *sads) {                       \
    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
  }

#define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
  void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
                                  const uint8_t *const refs[],            \
                                  int32_t ref_stride, uint32_t *sads) {   \
    sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
  }

#define VPX_SAD_16xHEIGHTx4D_MSA(height)                                    \
  void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                   const uint8_t *const refs[],             \
                                   int32_t ref_stride, uint32_t *sads) {    \
    sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
  }

#define VPX_SAD_32xHEIGHTx4D_MSA(height)                                    \
  void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                   const uint8_t *const refs[],             \
                                   int32_t ref_stride, uint32_t *sads) {    \
    sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
  }

#define VPX_SAD_64xHEIGHTx4D_MSA(height)                                    \
  void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                   const uint8_t *const refs[],             \
                                   int32_t ref_stride, uint32_t *sads) {    \
    sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);   \
  }

#define VPX_AVGSAD_4xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad4x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_4width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_8xHEIGHT_MSA(height)                                        \
  uint32_t vpx_sad8x##height##_avg_msa(const uint8_t *src, int32_t src_stride, \
                                       const uint8_t *ref, int32_t ref_stride, \
                                       const uint8_t *second_pred) {           \
    return avgsad_8width_msa(src, src_stride, ref, ref_stride, height,         \
                             second_pred);                                     \
  }

#define VPX_AVGSAD_16xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad16x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_16width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_32xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad32x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_32width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

#define VPX_AVGSAD_64xHEIGHT_MSA(height)                                \
  uint32_t vpx_sad64x##height##_avg_msa(                                \
      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
      int32_t ref_stride, const uint8_t *second_pred) {                 \
    return avgsad_64width_msa(src, src_stride, ref, ref_stride, height, \
                              second_pred);                             \
  }

// 64x64
VPX_SAD_64xHEIGHT_MSA(64);
VPX_SAD_64xHEIGHTx4D_MSA(64);
VPX_AVGSAD_64xHEIGHT_MSA(64);

// 64x32
VPX_SAD_64xHEIGHT_MSA(32);
VPX_SAD_64xHEIGHTx4D_MSA(32);
VPX_AVGSAD_64xHEIGHT_MSA(32);

// 32x64
VPX_SAD_32xHEIGHT_MSA(64);
VPX_SAD_32xHEIGHTx4D_MSA(64);
VPX_AVGSAD_32xHEIGHT_MSA(64);

// 32x32
VPX_SAD_32xHEIGHT_MSA(32);
VPX_SAD_32xHEIGHTx4D_MSA(32);
VPX_AVGSAD_32xHEIGHT_MSA(32);

// 32x16
VPX_SAD_32xHEIGHT_MSA(16);
VPX_SAD_32xHEIGHTx4D_MSA(16);
VPX_AVGSAD_32xHEIGHT_MSA(16);

// 16x32
VPX_SAD_16xHEIGHT_MSA(32);
VPX_SAD_16xHEIGHTx4D_MSA(32);
VPX_AVGSAD_16xHEIGHT_MSA(32);

// 16x16
VPX_SAD_16xHEIGHT_MSA(16);
VPX_SAD_16xHEIGHTx3_MSA(16);
VPX_SAD_16xHEIGHTx8_MSA(16);
VPX_SAD_16xHEIGHTx4D_MSA(16);
VPX_AVGSAD_16xHEIGHT_MSA(16);

// 16x8
VPX_SAD_16xHEIGHT_MSA(8);
VPX_SAD_16xHEIGHTx3_MSA(8);
VPX_SAD_16xHEIGHTx8_MSA(8);
VPX_SAD_16xHEIGHTx4D_MSA(8);
VPX_AVGSAD_16xHEIGHT_MSA(8);

// 8x16
VPX_SAD_8xHEIGHT_MSA(16);
VPX_SAD_8xHEIGHTx3_MSA(16);
VPX_SAD_8xHEIGHTx8_MSA(16);
VPX_SAD_8xHEIGHTx4D_MSA(16);
VPX_AVGSAD_8xHEIGHT_MSA(16);

// 8x8
VPX_SAD_8xHEIGHT_MSA(8);
VPX_SAD_8xHEIGHTx3_MSA(8);
VPX_SAD_8xHEIGHTx8_MSA(8);
VPX_SAD_8xHEIGHTx4D_MSA(8);
VPX_AVGSAD_8xHEIGHT_MSA(8);

// 8x4
VPX_SAD_8xHEIGHT_MSA(4);
VPX_SAD_8xHEIGHTx4D_MSA(4);
VPX_AVGSAD_8xHEIGHT_MSA(4);

// 4x8
VPX_SAD_4xHEIGHT_MSA(8);
VPX_SAD_4xHEIGHTx4D_MSA(8);
VPX_AVGSAD_4xHEIGHT_MSA(8);

// 4x4
VPX_SAD_4xHEIGHT_MSA(4);
VPX_SAD_4xHEIGHTx3_MSA(4);
VPX_SAD_4xHEIGHTx8_MSA(4);
VPX_SAD_4xHEIGHTx4D_MSA(4);
VPX_AVGSAD_4xHEIGHT_MSA(4);