#include <gtest/gtest.h>
#include "cpu.h"
#include "cpu_core.h"
#include "util.h"
#include "macros.h"
#include "IWelsVP.h"
#include "vaacalculation.h"

using namespace WelsVP;

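// Unit tests for the VAA calculation routines: each optimized implementation
// is checked bit-exactly against the plain C reference implementations below.

// Reference implementation: per 8x8 block, accumulate the SAD between the
// current and reference frames; per 16x16 macroblock, accumulate the pixel
// sum, squared sum and squared difference.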
void VAACalcSadSsd_ref (const uint8_t* pCurData, const uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
                        int32_t iPicStride,
                        int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth; // from the end of one MB row to the start of the next

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i ++) {
    for (int32_t j = 0; j < iMbWidth; j ++) {
      int32_t k, l;
      int32_t l_sad, l_sqdiff, l_sum, l_sqsum;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;

      pSum16x16[mb_index] = 0;
      psqsum16x16[mb_index] = 0;
      psqdiff16x16[mb_index] = 0;

      // top-left 8x8 block
      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sqdiff += diff * diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 0] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;

      // top-right 8x8 block
      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sqdiff += diff * diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 1] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;

      // bottom-left 8x8 block
      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sqdiff += diff * diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 2] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;

      // bottom-right 8x8 block
      l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sqdiff += diff * diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 3] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;

      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}

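// Reference implementation: SAD per 8x8 block plus pixel sum and squared sum
// per 16x16 macroblock (the inputs for a block-variance estimate).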
void VAACalcSadVar_ref (const uint8_t* pCurData, const uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
                        int32_t iPicStride,
                        int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i ++) {
    for (int32_t j = 0; j < iMbWidth; j ++) {
      int32_t k, l;
      int32_t l_sad, l_sum, l_sqsum;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;

      pSum16x16[mb_index] = 0;
      psqsum16x16[mb_index] = 0;

      // top-left 8x8 block
      l_sad =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 0] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;

      // top-right 8x8 block
      l_sad =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 1] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;

      // bottom-left 8x8 block
      l_sad =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 2] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;

      // bottom-right 8x8 block
      l_sad =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 3] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;

      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}

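// Reference implementation: SAD per 8x8 block only, accumulated into the
// frame-level SAD.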
void VAACalcSad_ref (const uint8_t* pCurData, const uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
                     int32_t iPicStride,
                     int32_t* pFrameSad, int32_t* pSad8x8) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i ++) {
    for (int32_t j = 0; j < iMbWidth; j ++) {
      int32_t k, l;
      int32_t l_sad;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;

      // top-left 8x8 block
      l_sad =  0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 0] = l_sad;

      // top-right 8x8 block
      l_sad =  0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 1] = l_sad;

      // bottom-left 8x8 block
      l_sad =  0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 2] = l_sad;

      // bottom-right 8x8 block
      l_sad =  0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = WELS_ABS (tmp_cur_row[l] - tmp_ref_row[l]);
          l_sad += diff;
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 3] = l_sad;

      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}

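// Reference implementation: the full statistics set for background detection:
// SAD, signed difference (SD) and maximum absolute difference (MAD) per 8x8
// block, plus pixel sum, squared sum and squared difference per 16x16
// macroblock.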
void VAACalcSadSsdBgd_ref (const uint8_t* pCurData, const uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
                           int32_t iPicStride,
                           int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSum16x16, int32_t* psqsum16x16, int32_t* psqdiff16x16, int32_t* pSd8x8,
                           uint8_t* pMad8x8) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i ++) {
    for (int32_t j = 0; j < iMbWidth; j ++) {
      int32_t k, l;
      int32_t l_sad, l_sqdiff, l_sum, l_sqsum, l_sd, l_mad;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;

      pSum16x16[mb_index] = 0;
      psqsum16x16[mb_index] = 0;
      psqdiff16x16[mb_index] = 0;

      // top-left 8x8 block
      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);

          l_sd += diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
          l_sad += abs_diff;
          l_sqdiff += abs_diff * abs_diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 0] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;
      pSd8x8[ (mb_index << 2) + 0] = l_sd;
      pMad8x8[ (mb_index << 2) + 0] = l_mad;

      // top-right 8x8 block
      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);

          l_sd += diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
          l_sad += abs_diff;
          l_sqdiff += abs_diff * abs_diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 1] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;
      pSd8x8[ (mb_index << 2) + 1] = l_sd;
      pMad8x8[ (mb_index << 2) + 1] = l_mad;

      // bottom-left 8x8 block
      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);

          l_sd += diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
          l_sad += abs_diff;
          l_sqdiff += abs_diff * abs_diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 2] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;
      pSd8x8[ (mb_index << 2) + 2] = l_sd;
      pMad8x8[ (mb_index << 2) + 2] = l_mad;

      // bottom-right 8x8 block
      l_sd = l_mad = l_sad =  l_sqdiff =  l_sum =  l_sqsum = 0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);

          l_sd += diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
          l_sad += abs_diff;
          l_sqdiff += abs_diff * abs_diff;
          l_sum += tmp_cur_row[l];
          l_sqsum += tmp_cur_row[l] * tmp_cur_row[l];
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 3] = l_sad;
      pSum16x16[mb_index] += l_sum;
      psqsum16x16[mb_index] += l_sqsum;
      psqdiff16x16[mb_index] += l_sqdiff;
      pSd8x8[ (mb_index << 2) + 3] = l_sd;
      pMad8x8[ (mb_index << 2) + 3] = l_mad;

      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}

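// Reference implementation: the background-detection subset: SAD, SD and MAD
// per 8x8 block.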
void VAACalcSadBgd_ref (const uint8_t* pCurData, const uint8_t* pRefData, int32_t iPicWidth, int32_t iPicHeight,
                        int32_t iPicStride,
                        int32_t* pFrameSad, int32_t* pSad8x8, int32_t* pSd8x8, uint8_t* pMad8x8) {
  const uint8_t* tmp_ref = pRefData;
  const uint8_t* tmp_cur = pCurData;
  int32_t iMbWidth = (iPicWidth >> 4);
  int32_t mb_height = (iPicHeight >> 4);
  int32_t mb_index = 0;
  int32_t pic_stride_x8 = iPicStride << 3;
  int32_t step = (iPicStride << 4) - iPicWidth;

  *pFrameSad = 0;
  for (int32_t i = 0; i < mb_height; i ++) {
    for (int32_t j = 0; j < iMbWidth; j ++) {
      int32_t k, l;
      int32_t l_sad, l_sd, l_mad;
      const uint8_t* tmp_cur_row;
      const uint8_t* tmp_ref_row;

      // top-left 8x8 block
      l_mad = l_sd = l_sad =  0;
      tmp_cur_row = tmp_cur;
      tmp_ref_row = tmp_ref;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);
          l_sd += diff;
          l_sad += abs_diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 0] = l_sad;
      pSd8x8[ (mb_index << 2) + 0] = l_sd;
      pMad8x8[ (mb_index << 2) + 0] = l_mad;

      // top-right 8x8 block
      l_mad = l_sd = l_sad =  0;
      tmp_cur_row = tmp_cur + 8;
      tmp_ref_row = tmp_ref + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);
          l_sd += diff;
          l_sad += abs_diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 1] = l_sad;
      pSd8x8[ (mb_index << 2) + 1] = l_sd;
      pMad8x8[ (mb_index << 2) + 1] = l_mad;

      // bottom-left 8x8 block
      l_mad = l_sd = l_sad =  0;
      tmp_cur_row = tmp_cur + pic_stride_x8;
      tmp_ref_row = tmp_ref + pic_stride_x8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);
          l_sd += diff;
          l_sad += abs_diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 2] = l_sad;
      pSd8x8[ (mb_index << 2) + 2] = l_sd;
      pMad8x8[ (mb_index << 2) + 2] = l_mad;

      // bottom-right 8x8 block
      l_mad = l_sd = l_sad =  0;
      tmp_cur_row = tmp_cur + pic_stride_x8 + 8;
      tmp_ref_row = tmp_ref + pic_stride_x8 + 8;
      for (k = 0; k < 8; k ++) {
        for (l = 0; l < 8; l ++) {
          int32_t diff = tmp_cur_row[l] - tmp_ref_row[l];
          int32_t abs_diff = WELS_ABS (diff);
          l_sd += diff;
          l_sad += abs_diff;
          if (abs_diff > l_mad) {
            l_mad = abs_diff;
          }
        }
        tmp_cur_row += iPicStride;
        tmp_ref_row += iPicStride;
      }
      *pFrameSad += l_sad;
      pSad8x8[ (mb_index << 2) + 3] = l_sad;
      pSd8x8[ (mb_index << 2) + 3] = l_sd;
      pMad8x8[ (mb_index << 2) + 3] = l_mad;

      tmp_ref += 16;
      tmp_cur += 16;
      ++mb_index;
    }
    tmp_ref += step;
    tmp_cur += step;
  }
}

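// Test harness. Each GENERATE_*_UT macro instantiates a GoogleTest case that
// fills identical random data into "_c" (reference) and "_a" (under test)
// buffers, runs the C reference and the candidate implementation, and asserts
// that every output matches exactly. When ASM is nonzero, the test returns
// early unless WelsCPUFeatureDetect() reports the required CPUFLAGS.
// BUFFER_SIZE holds one 320x320 plane; widths of 320, 304, 288 and 272 are
// exercised against a fixed stride and height of 320.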
#define BUFFER_SIZE (320*320)

#define GENERATE_VAACalcSad_UT(func, ASM, CPUFLAGS) \
TEST (VAACalcFuncTest, func) { \
    if (ASM) {\
        int32_t iCpuCores = 0; \
        uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
        if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
            return; \
    } \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_c, BUFFER_SIZE/64, 16); \
    int32_t pic_width_c; \
    int32_t pic_height_c; \
    int32_t pic_stride_c; \
    int32_t psadframe_c; \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_a, BUFFER_SIZE/64, 16); \
    int32_t pic_width_a; \
    int32_t pic_height_a; \
    int32_t pic_stride_a; \
    int32_t psadframe_a; \
    for (int i=0; i<4; i++) { \
        pic_width_c  = pic_width_a = 320-16*i; \
        pic_height_c = pic_height_a = 320; \
        pic_stride_c = pic_stride_a = 320; \
        psadframe_c = psadframe_a = 0; \
        for (int j=0; j<BUFFER_SIZE; j++) { \
            cur_data_c[j] = cur_data_a[j] = (rand()%256); \
            ref_data_c[j] = ref_data_a[j] = (rand()%256); \
            psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
        } \
        VAACalcSad_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c); \
        func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a); \
        ASSERT_EQ (psadframe_a, psadframe_c); \
        for (int j=0; j<(BUFFER_SIZE/64); j++) \
            ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
    } \
}

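// Same pattern, additionally comparing the per-8x8 SD and MAD outputs.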
#define GENERATE_VAACalcSadBgd_UT(func, ASM, CPUFLAGS) \
TEST (VAACalcFuncTest, func) { \
    if (ASM) {\
        int32_t iCpuCores = 0; \
        uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
        if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
            return; \
    } \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psd8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, pmad8x8_c, BUFFER_SIZE/64, 16); \
    int32_t pic_width_c; \
    int32_t pic_height_c; \
    int32_t pic_stride_c; \
    int32_t psadframe_c; \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psd8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, pmad8x8_a, BUFFER_SIZE/64, 16); \
    int32_t pic_width_a; \
    int32_t pic_height_a; \
    int32_t pic_stride_a; \
    int32_t psadframe_a; \
    for (int i=0; i<4; i++) { \
        pic_width_c  = pic_width_a = 320-16*i; \
        pic_height_c = pic_height_a = 320; \
        pic_stride_c = pic_stride_a = 320; \
        psadframe_c = psadframe_a = 0; \
        for (int j=0; j<BUFFER_SIZE; j++) { \
            cur_data_c[j] = cur_data_a[j] = (rand()%256); \
            ref_data_c[j] = ref_data_a[j] = (rand()%256); \
            psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
            psd8x8_c[j%(BUFFER_SIZE/64)]  = psd8x8_a[j%(BUFFER_SIZE/64)]  = (rand()%256); \
            pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
        } \
        VAACalcSadBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psd8x8_c, pmad8x8_c); \
        func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psd8x8_a, pmad8x8_a); \
        ASSERT_EQ (psadframe_a, psadframe_c); \
        for (int j=0; j<(BUFFER_SIZE/64); j++) {\
            ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
            ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
            ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
        } \
    } \
}

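// Adds the per-16x16 sum, squared-sum and squared-difference outputs.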
#define GENERATE_VAACalcSadSsd_UT(func, ASM, CPUFLAGS) \
TEST (VAACalcFuncTest, func) { \
    if (ASM) {\
        int32_t iCpuCores = 0; \
        uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
        if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
            return; \
    } \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_c, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_c, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqdiff16x16_c, BUFFER_SIZE/256, 16); \
    int32_t pic_width_c; \
    int32_t pic_height_c; \
    int32_t pic_stride_c; \
    int32_t psadframe_c; \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_a, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_a, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqdiff16x16_a, BUFFER_SIZE/256, 16); \
    int32_t pic_width_a; \
    int32_t pic_height_a; \
    int32_t pic_stride_a; \
    int32_t psadframe_a; \
    for (int i=0; i<4; i++) { \
        pic_width_c  = pic_width_a = 320-16*i; \
        pic_height_c = pic_height_a = 320; \
        pic_stride_c = pic_stride_a = 320; \
        psadframe_c = psadframe_a = 0; \
        for (int j=0; j<BUFFER_SIZE; j++) { \
            cur_data_c[j] = cur_data_a[j] = (rand()%256); \
            ref_data_c[j] = ref_data_a[j] = (rand()%256); \
            psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
            psum16x16_c[j%(BUFFER_SIZE/256)]    = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
            psqsum16x16_c[j%(BUFFER_SIZE/256)]  = psqsum16x16_a[j%(BUFFER_SIZE/256)]  = (rand()%256); \
            psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
        } \
        VAACalcSadSsd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c); \
        func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a); \
        ASSERT_EQ (psadframe_a, psadframe_c); \
        for (int j=0; j<(BUFFER_SIZE/64); j++) {\
            ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
        } \
        for (int j=0; j<(BUFFER_SIZE/256); j++) {\
            ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
            ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
            ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
        } \
    } \
}

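// Compares SAD plus the per-16x16 sum and squared sum.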
#define GENERATE_VAACalcSadVar_UT(func, ASM, CPUFLAGS) \
TEST (VAACalcFuncTest, func) { \
    if (ASM) {\
        int32_t iCpuCores = 0; \
        uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
        if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
            return; \
    } \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_c, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_c, BUFFER_SIZE/256, 16); \
    int32_t pic_width_c; \
    int32_t pic_height_c; \
    int32_t pic_stride_c; \
    int32_t psadframe_c; \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_a, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_a, BUFFER_SIZE/256, 16); \
    int32_t pic_width_a; \
    int32_t pic_height_a; \
    int32_t pic_stride_a; \
    int32_t psadframe_a; \
    for (int i=0; i<4; i++) { \
        pic_width_c  = pic_width_a = 320-16*i; \
        pic_height_c = pic_height_a = 320; \
        pic_stride_c = pic_stride_a = 320; \
        psadframe_c = psadframe_a = 0; \
        for (int j=0; j<BUFFER_SIZE; j++) { \
            cur_data_c[j] = cur_data_a[j] = (rand()%256); \
            ref_data_c[j] = ref_data_a[j] = (rand()%256); \
            psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
            psum16x16_c[j%(BUFFER_SIZE/256)]    = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
            psqsum16x16_c[j%(BUFFER_SIZE/256)]  = psqsum16x16_a[j%(BUFFER_SIZE/256)]  = (rand()%256); \
        } \
        VAACalcSadVar_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c); \
        func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a); \
        ASSERT_EQ (psadframe_a, psadframe_c); \
        for (int j=0; j<(BUFFER_SIZE/64); j++) {\
            ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
        } \
        for (int j=0; j<(BUFFER_SIZE/256); j++) {\
            ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
            ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
        } \
    } \
}

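// Full-statistics variant: every 8x8 and 16x16 output is compared.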
#define GENERATE_VAACalcSadSsdBgd_UT(func, ASM, CPUFLAGS) \
TEST (VAACalcFuncTest, func) { \
    if (ASM) {\
        int32_t iCpuCores = 0; \
        uint32_t m_uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores); \
        if (0 == (m_uiCpuFeatureFlag & CPUFLAGS)) \
            return; \
    } \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_c, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psd8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, pmad8x8_c, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_c, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_c, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqdiff16x16_c, BUFFER_SIZE/256, 16); \
    int32_t pic_width_c; \
    int32_t pic_height_c; \
    int32_t pic_stride_c; \
    int32_t psadframe_c; \
    ENFORCE_STACK_ALIGN_1D (uint8_t, cur_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, ref_data_a, BUFFER_SIZE, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psad8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psd8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (uint8_t, pmad8x8_a, BUFFER_SIZE/64, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psum16x16_a, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqsum16x16_a, BUFFER_SIZE/256, 16); \
    ENFORCE_STACK_ALIGN_1D (int32_t, psqdiff16x16_a, BUFFER_SIZE/256, 16); \
    int32_t pic_width_a; \
    int32_t pic_height_a; \
    int32_t pic_stride_a; \
    int32_t psadframe_a; \
    for (int i=0; i<4; i++) { \
        pic_width_c  = pic_width_a = 320-16*i; \
        pic_height_c = pic_height_a = 320; \
        pic_stride_c = pic_stride_a = 320; \
        psadframe_c = psadframe_a = 0; \
        for (int j=0; j<BUFFER_SIZE; j++) { \
            cur_data_c[j] = cur_data_a[j] = (rand()%256); \
            ref_data_c[j] = ref_data_a[j] = (rand()%256); \
            psad8x8_c[j%(BUFFER_SIZE/64)] = psad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
            psd8x8_c[j%(BUFFER_SIZE/64)]  = psd8x8_a[j%(BUFFER_SIZE/64)]  = (rand()%256); \
            pmad8x8_c[j%(BUFFER_SIZE/64)] = pmad8x8_a[j%(BUFFER_SIZE/64)] = (rand()%256); \
            psum16x16_c[j%(BUFFER_SIZE/256)]    = psum16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
            psqsum16x16_c[j%(BUFFER_SIZE/256)]  = psqsum16x16_a[j%(BUFFER_SIZE/256)]  = (rand()%256); \
            psqdiff16x16_c[j%(BUFFER_SIZE/256)] = psqdiff16x16_a[j%(BUFFER_SIZE/256)] = (rand()%256); \
        } \
        VAACalcSadSsdBgd_ref (cur_data_c, ref_data_c, pic_width_c, pic_height_c, pic_stride_c, &psadframe_c, psad8x8_c, psum16x16_c, psqsum16x16_c, psqdiff16x16_c, psd8x8_c, pmad8x8_c); \
        func (cur_data_a, ref_data_a, pic_width_a, pic_height_a, pic_stride_a, &psadframe_a, psad8x8_a, psum16x16_a, psqsum16x16_a, psqdiff16x16_a, psd8x8_a, pmad8x8_a); \
        ASSERT_EQ (psadframe_a, psadframe_c); \
        for (int j=0; j<(BUFFER_SIZE/64); j++) {\
            ASSERT_EQ (psad8x8_a[j], psad8x8_c[j]); \
            ASSERT_EQ (psd8x8_a[j], psd8x8_c[j]); \
            ASSERT_EQ (pmad8x8_a[j], pmad8x8_c[j]); \
        } \
        for (int j=0; j<(BUFFER_SIZE/256); j++) {\
            ASSERT_EQ (psum16x16_a[j], psum16x16_c[j]); \
            ASSERT_EQ (psqsum16x16_a[j], psqsum16x16_c[j]); \
            ASSERT_EQ (psqdiff16x16_a[j], psqdiff16x16_c[j]); \
        } \
    } \
}

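// Instantiate the tests: the plain C versions always run; the SIMD versions
// are compiled per platform and gated at runtime on the matching CPU flag.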
GENERATE_VAACalcSad_UT (VAACalcSad_c, 0, 0)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_c, 0, 0)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_c, 0, 0)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_c, 0, 0)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_c, 0, 0)
#if defined(X86_ASM)
GENERATE_VAACalcSad_UT (VAACalcSad_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_sse2, 1, WELS_CPU_SSE2)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_sse2, 1, WELS_CPU_SSE2)

#if defined(HAVE_AVX2)
GENERATE_VAACalcSad_UT (VAACalcSad_avx2, 1, WELS_CPU_AVX2)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_avx2, 1, WELS_CPU_AVX2)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_avx2, 1, WELS_CPU_AVX2)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_avx2, 1, WELS_CPU_AVX2)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_avx2, 1, WELS_CPU_AVX2)
#endif //HAVE_AVX2
#endif //X86_ASM

#if defined(HAVE_NEON)
GENERATE_VAACalcSad_UT (VAACalcSad_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_neon, 1, WELS_CPU_NEON)
#endif //HAVE_NEON

#if defined(HAVE_NEON_AARCH64)
GENERATE_VAACalcSad_UT (VAACalcSad_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_AArch64_neon, 1, WELS_CPU_NEON)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_AArch64_neon, 1, WELS_CPU_NEON)
#endif //HAVE_NEON_AARCH64

#if defined(HAVE_MMI)
GENERATE_VAACalcSad_UT (VAACalcSad_mmi, 1, WELS_CPU_MMI)
GENERATE_VAACalcSadBgd_UT (VAACalcSadBgd_mmi, 1, WELS_CPU_MMI)
GENERATE_VAACalcSadSsdBgd_UT (VAACalcSadSsdBgd_mmi, 1, WELS_CPU_MMI)
GENERATE_VAACalcSadSsd_UT (VAACalcSadSsd_mmi, 1, WELS_CPU_MMI)
GENERATE_VAACalcSadVar_UT (VAACalcSadVar_mmi, 1, WELS_CPU_MMI)
#endif //HAVE_MMI