1 #include <gtest/gtest.h>
2 #include "decode_mb_aux.h"
3 #include "wels_common_basis.h"
4 #include "macros.h"
5 #include "cpu.h"
6 
7 using namespace WelsEnc;
8 
9 
// Compares WelsIHadamard4x4Dc with a two-pass reference: the same +/- sign
// pattern is applied first down each column and then along each row.
TEST (DecodeMbAuxTest, TestIhdm_4x4_dc) {
  // kSign[k][m] is the weight of the m-th input in the k-th output of a pass.
  static const int kSign[4][4] = {
    { 1,  1,  1,  1 },
    { 1,  1, -1, -1 },
    { 1, -1, -1,  1 },
    { 1, -1,  1, -1 }
  };
  short iCoeff[16], iColPass[16], iExpected[16];
  for (int n = 0; n < 16; n++)
    iCoeff[n] = rand() % 256 + 1;

  // Pass 1: entry (k, c) combines the four inputs of column c.
  for (int k = 0; k < 4; k++) {
    for (int c = 0; c < 4; c++) {
      int iAcc = 0;
      for (int m = 0; m < 4; m++)
        iAcc += kSign[k][m] * iCoeff[4 * m + c];
      iColPass[4 * k + c] = static_cast<short> (iAcc);
    }
  }

  // Pass 2: entry (r, k) combines the four pass-1 values of row r.
  for (int r = 0; r < 4; r++) {
    for (int k = 0; k < 4; k++) {
      int iAcc = 0;
      for (int m = 0; m < 4; m++)
        iAcc += kSign[k][m] * iColPass[4 * r + m];
      iExpected[4 * r + k] = static_cast<short> (iAcc);
    }
  }

  // The function under test transforms its argument in place.
  WelsIHadamard4x4Dc (iCoeff);
  for (int n = 0; n < 16; n++)
    EXPECT_EQ (iExpected[n], iCoeff[n]);
}
59 
// For qp 0..11, checks WelsDequantLumaDc4x4 against the scalar formula
// (coeff * g_kuiDequantCoeff[qp % 6][0] + round) >> shift.
TEST (DecodeMbAuxTest, TestDequant_4x4_luma_dc) {
  short iInput[16], iCoeff[16];

  for (int qp = 0; qp < 12; qp++) {
    // Shift and rounding term depend only on qp / 6 (0 or 1 in this range).
    const int iQpDiv6 = qp / 6;
    const int iShift  = 2 - iQpDiv6;
    const int iRound  = 1 << (1 - iQpDiv6);

    for (int n = 0; n < 16; n++) {
      iInput[n] = rand() % 256 + 1;
      iCoeff[n] = iInput[n];
    }

    WelsDequantLumaDc4x4 (iCoeff, qp);

    for (int n = 0; n < 16; n++) {
      const short iExpected = (iInput[n] * g_kuiDequantCoeff[qp % 6][0] + iRound) >> iShift;
      EXPECT_EQ (iExpected, iCoeff[n]);
    }
  }
}
75 
// Compares WelsDequantIHadamard4x4_c with a two-pass reference whose second
// pass additionally multiplies every output by the scale factor.
TEST (DecodeMbAuxTest, TestDequant_ihdm_4x4_c) {
  // kSign[k][m] is the weight of the m-th input in the k-th output of a pass.
  static const int kSign[4][4] = {
    { 1,  1,  1,  1 },
    { 1,  1, -1, -1 },
    { 1, -1, -1,  1 },
    { 1, -1,  1, -1 }
  };
  short iCoeff[16], iColPass[16], iExpected[16];
  const unsigned short uiScale = rand() % 16 + 1;
  for (int n = 0; n < 16; n++)
    iCoeff[n] = rand() % 256 + 1;

  // Pass 1: entry (k, c) combines the four inputs of column c.
  for (int k = 0; k < 4; k++) {
    for (int c = 0; c < 4; c++) {
      int iAcc = 0;
      for (int m = 0; m < 4; m++)
        iAcc += kSign[k][m] * iCoeff[4 * m + c];
      iColPass[4 * k + c] = static_cast<short> (iAcc);
    }
  }

  // Pass 2: entry (r, k) combines the four pass-1 values of row r, scaled.
  // The product is narrowed to short, so the expectation wraps to 16 bits.
  for (int r = 0; r < 4; r++) {
    for (int k = 0; k < 4; k++) {
      int iAcc = 0;
      for (int m = 0; m < 4; m++)
        iAcc += kSign[k][m] * iColPass[4 * r + m];
      iExpected[4 * r + k] = static_cast<short> (iAcc * uiScale);
    }
  }

  // The function under test transforms its argument in place.
  WelsDequantIHadamard4x4_c (iCoeff, uiScale);
  for (int n = 0; n < 16; n++)
    EXPECT_EQ (iExpected[n], iCoeff[n]);
}
126 
// Expects WelsDequant4x4_c to multiply coefficient n by factor (n % 8).
TEST (DecodeMbAuxTest, TestDequant_4x4_c) {
  short iCoeff[16], iOrig[16];
  unsigned short uiMf[16];
  for (int n = 0; n < 16; n++)
    iOrig[n] = iCoeff[n] = rand() % 256 + 1;

  // Only the first eight factors are set; the expectation indexes them modulo 8.
  for (int k = 0; k < 8; k++)
    uiMf[k] = rand() % 16 + 1;

  WelsDequant4x4_c (iCoeff, uiMf);

  for (int n = 0; n < 16; n++) {
    const int iExpected = iOrig[n] * uiMf[n % 8];
    EXPECT_EQ (iExpected, iCoeff[n]);
  }
}
// Expects WelsDequantFour4x4_c to multiply each of the 64 coefficients by
// factor (n % 8).
TEST (DecodeMbAuxTest, TestDequant_4_4x4_c) {
  short iCoeff[64], iOrig[64];
  unsigned short uiMf[16];
  for (int n = 0; n < 64; n++)
    iOrig[n] = iCoeff[n] = rand() % 256 + 1;

  // Only the first eight factors are set; the expectation indexes them modulo 8.
  for (int k = 0; k < 8; k++)
    uiMf[k] = rand() % 16 + 1;

  WelsDequantFour4x4_c (iCoeff, uiMf);

  for (int n = 0; n < 64; n++) {
    const int iExpected = iOrig[n] * uiMf[n % 8];
    EXPECT_EQ (iExpected, iCoeff[n]);
  }
}
// Reference for the 2x2 DC test: combines the four values of pDct with a
// sum/difference pattern, multiplies by iMF and halves, writing back in place.
//   pDct - four coefficients, overwritten with the result
//   iMF  - multiplier applied before the >> 1
void WelsDequantHadamard2x2DcAnchor (int16_t* pDct, int16_t iMF) {
  const int16_t iIn0 = pDct[0];
  const int16_t iIn1 = pDct[1];
  const int16_t iIn2 = pDct[2];
  const int16_t iIn3 = pDct[3];

  // First stage results are narrowed to 16 bits before they are combined.
  const int16_t iEvenSum  = static_cast<int16_t> (iIn0 + iIn2);
  const int16_t iEvenDiff = static_cast<int16_t> (iIn0 - iIn2);
  const int16_t iOddSum   = static_cast<int16_t> (iIn1 + iIn3);
  const int16_t iOddDiff  = static_cast<int16_t> (iIn1 - iIn3);

  // Second stage runs in int; only the final store narrows again.
  pDct[0] = static_cast<int16_t> (((iEvenSum  + iOddSum)  * iMF) >> 1);
  pDct[1] = static_cast<int16_t> (((iEvenSum  - iOddSum)  * iMF) >> 1);
  pDct[2] = static_cast<int16_t> (((iEvenDiff + iOddDiff) * iMF) >> 1);
  pDct[3] = static_cast<int16_t> (((iEvenDiff - iOddDiff) * iMF) >> 1);
}
// Feeds identical random coefficients to WelsDequantIHadamard2x2Dc and to the
// anchor, then expects all four outputs to agree.
TEST (DecodeMbAuxTest, WelsDequantIHadamard2x2Dc) {
  int16_t iActual[4], iExpected[4];
  const int16_t iMF = rand() & 127;
  for (int k = 0; k < 4; k++)
    iActual[k] = iExpected[k] = (rand() & 65535) - 32768;

  WelsDequantHadamard2x2DcAnchor (iExpected, iMF);
  WelsDequantIHadamard2x2Dc (iActual, iMF);

  // A single flag is reported; the scan ends at the first differing entry.
  bool bAllEqual = true;
  for (int k = 0; k < 4 && bAllEqual; k++)
    bAllEqual = (iActual[k] == iExpected[k]);
  EXPECT_TRUE (bAllEqual);
}
181 #define FDEC_STRIDE 32
182 template<typename clip_t>
WelsIDctT4Anchor(uint8_t * p_dst,int16_t dct[16])183 void WelsIDctT4Anchor (uint8_t* p_dst, int16_t dct[16]) {
184   int16_t tmp[16];
185   int32_t iStridex2 = (FDEC_STRIDE << 1);
186   int32_t iStridex3 = iStridex2 + FDEC_STRIDE;
187   uint8_t uiDst = 0;
188   int i;
189   for (i = 0; i < 4; i++) {
190     tmp[i << 2]     = dct[i << 2] + dct[ (i << 2) + 1]      + dct[ (i << 2) + 2] + (dct[ (i << 2) + 3] >> 1);
191     tmp[ (i << 2) + 1] = dct[i << 2] + (dct[ (i << 2) + 1] >> 1) - dct[ (i << 2) + 2] - dct[ (i << 2) + 3];
192     tmp[ (i << 2) + 2] = dct[i << 2] - (dct[ (i << 2) + 1] >> 1) - dct[ (i << 2) + 2] + dct[ (i << 2) + 3];
193     tmp[ (i << 2) + 3] = dct[i << 2] - dct[ (i << 2) + 1]      + dct[ (i << 2) + 2] - (dct[ (i << 2) + 3] >> 1);
194   }
195   for (i = 0; i < 4; i++) {
196     uiDst = p_dst[i];
197     p_dst[i]             = WelsClip1 (uiDst + (clip_t (tmp[i] + tmp[4 + i] +     tmp[8 + i] + (tmp[12 + i] >> 1) + 32) >> 6));
198     uiDst = p_dst[i + FDEC_STRIDE];
199     p_dst[i + FDEC_STRIDE] = WelsClip1 (uiDst + (clip_t (tmp[i] + (tmp[4 + i] >> 1) - tmp[8 + i] - tmp[12 + i] + 32)     >> 6));
200     uiDst = p_dst[i + iStridex2];
201     p_dst[i + iStridex2]   = WelsClip1 (uiDst + (clip_t (tmp[i] - (tmp[4 + i] >> 1) - tmp[8 + i] + tmp[12 + i] + 32)     >> 6));
202     uiDst = p_dst[i + iStridex3];
203     p_dst[i + iStridex3]   = WelsClip1 (uiDst + (clip_t (tmp[i] - tmp[4 + i] +     tmp[8 + i] - (tmp[12 + i] >> 1) + 32) >> 6));
204   }
205 }
206 template<typename clip_t>
TestIDctT4Rec(PIDctFunc func)207 void TestIDctT4Rec (PIDctFunc func) {
208   int16_t iRefDct[16];
209   uint8_t iRefDst[16 * FDEC_STRIDE];
210   ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
211   ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
212   ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
213   for (int i = 0; i < 4; i++) {
214     for (int j = 0; j < 4; j++) {
215       iRefDct[i * 4 + j] = iDct[i * 4 + j] = (rand() & 65535) - 32768;
216       iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255;
217     }
218   }
219   WelsIDctT4Anchor<clip_t> (iRefDst, iRefDct);
220   func (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
221   int ok = -1;
222   for (int i = 0; i < 4; i++) {
223     for (int j = 0; j < 4; j++) {
224       if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {
225         ok = i * 4 + j;
226         break;
227       }
228     }
229   }
230   EXPECT_EQ (ok, -1);
231 }
// Runs the shared 4x4 driver on WelsIDctT4Rec_c with int32_t as the anchor's
// intermediate type.
TEST (DecodeMbAuxTest, WelsIDctT4Rec_c) {
  const PIDctFunc pfIdct = WelsIDctT4Rec_c;
  TestIDctT4Rec<int32_t> (pfIdct);
}
#if defined(X86_ASM)
// x86 assembly variants of the 4x4 test; all use int16_t as the anchor's
// intermediate type.
TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmx) {
  const PIDctFunc pfIdct = WelsIDctT4Rec_mmx;
  TestIDctT4Rec<int16_t> (pfIdct);
}
TEST (DecodeMbAuxTest, WelsIDctT4Rec_sse2) {
  const PIDctFunc pfIdct = WelsIDctT4Rec_sse2;
  TestIDctT4Rec<int16_t> (pfIdct);
}
#if defined(HAVE_AVX2)
TEST (DecodeMbAuxTest, WelsIDctT4Rec_avx2) {
  // Nothing is checked when the CPU does not report AVX2.
  if (! (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2))
    return;
  TestIDctT4Rec<int16_t> (WelsIDctT4Rec_avx2);
}
#endif
#endif
#if defined(HAVE_MMI)
// MMI variant of the 4x4 test, with int16_t as the anchor's intermediate type.
TEST (DecodeMbAuxTest, WelsIDctT4Rec_mmi) {
  const PIDctFunc pfIdct = WelsIDctT4Rec_mmi;
  TestIDctT4Rec<int16_t> (pfIdct);
}
#endif
254 template<typename clip_t>
WelsIDctT8Anchor(uint8_t * p_dst,int16_t dct[4][16])255 void WelsIDctT8Anchor (uint8_t* p_dst, int16_t dct[4][16]) {
256   WelsIDctT4Anchor<clip_t> (&p_dst[0],                   dct[0]);
257   WelsIDctT4Anchor<clip_t> (&p_dst[4],                   dct[1]);
258   WelsIDctT4Anchor<clip_t> (&p_dst[4 * FDEC_STRIDE + 0], dct[2]);
259   WelsIDctT4Anchor<clip_t> (&p_dst[4 * FDEC_STRIDE + 4], dct[3]);
260 }
261 template<typename clip_t>
TestIDctFourT4Rec(PIDctFunc func)262 void TestIDctFourT4Rec (PIDctFunc func) {
263   int16_t iRefDct[4][16];
264   uint8_t iRefDst[16 * FDEC_STRIDE];
265   ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 64, 16);
266   ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
267   ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
268   for (int k = 0; k < 4; k++)
269     for (int i = 0; i < 16; i++)
270       iRefDct[k][i] = iDct[k * 16 + i] = (rand() & 65535) - 32768;
271 
272   for (int i = 0; i < 8; i++)
273     for (int j = 0; j < 8; j++)
274       iPred[i * FDEC_STRIDE + j] = iRefDst[i * FDEC_STRIDE + j] = rand() & 255;
275 
276   WelsIDctT8Anchor<clip_t> (iRefDst, iRefDct);
277   func (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);
278   int ok = -1;
279   for (int i = 0; i < 8; i++) {
280     for (int j = 0; j < 8; j++) {
281       if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j]) {
282         ok = i * 8 + j;
283         break;
284       }
285     }
286   }
287   EXPECT_EQ (ok, -1);
288 }
// Runs the shared four-block driver on WelsIDctFourT4Rec_c with int32_t as the
// anchor's intermediate type.
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_c) {
  const PIDctFunc pfIdct = WelsIDctFourT4Rec_c;
  TestIDctFourT4Rec<int32_t> (pfIdct);
}
WelsIDctRecI16x4DcAnchor(uint8_t * p_dst,int16_t dct[4])292 void WelsIDctRecI16x4DcAnchor (uint8_t* p_dst, int16_t dct[4]) {
293   for (int i = 0; i < 4; i++, p_dst += FDEC_STRIDE) {
294     p_dst[0] = WelsClip1 (p_dst[0] + ((dct[0] + 32) >> 6));
295     p_dst[1] = WelsClip1 (p_dst[1] + ((dct[0] + 32) >> 6));
296     p_dst[2] = WelsClip1 (p_dst[2] + ((dct[0] + 32) >> 6));
297     p_dst[3] = WelsClip1 (p_dst[3] + ((dct[0] + 32) >> 6));
298 
299     p_dst[4] = WelsClip1 (p_dst[4] + ((dct[1] + 32) >> 6));
300     p_dst[5] = WelsClip1 (p_dst[5] + ((dct[1] + 32) >> 6));
301     p_dst[6] = WelsClip1 (p_dst[6] + ((dct[1] + 32) >> 6));
302     p_dst[7] = WelsClip1 (p_dst[7] + ((dct[1] + 32) >> 6));
303 
304     p_dst[8]  = WelsClip1 (p_dst[8]  + ((dct[2] + 32) >> 6));
305     p_dst[9]  = WelsClip1 (p_dst[9]  + ((dct[2] + 32) >> 6));
306     p_dst[10] = WelsClip1 (p_dst[10] + ((dct[2] + 32) >> 6));
307     p_dst[11] = WelsClip1 (p_dst[11] + ((dct[2] + 32) >> 6));
308 
309     p_dst[12] = WelsClip1 (p_dst[12] + ((dct[3] + 32) >> 6));
310     p_dst[13] = WelsClip1 (p_dst[13] + ((dct[3] + 32) >> 6));
311     p_dst[14] = WelsClip1 (p_dst[14] + ((dct[3] + 32) >> 6));
312     p_dst[15] = WelsClip1 (p_dst[15] + ((dct[3] + 32) >> 6));
313   }
314 }
WelsIDctRecI16x16DcAnchor(uint8_t * p_dst,int16_t dct[4][4])315 void WelsIDctRecI16x16DcAnchor (uint8_t* p_dst, int16_t dct[4][4]) {
316   for (int i = 0; i < 4; i++, p_dst += 4 * FDEC_STRIDE)
317     WelsIDctRecI16x4DcAnchor (&p_dst[0], dct[i]);
318 }
319 
// Compares WelsIDctRecI16x16Dc_c with WelsIDctRecI16x16DcAnchor on a random
// 16x16 prediction and 16 random DC values; the anchor works in place on a
// copy of the prediction.
TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_c) {
  uint8_t iRefDst[16 * FDEC_STRIDE];
  int16_t iRefDct[4][4];
  ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
  ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
  ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
  for (int i = 0; i < 16; i++)
    for (int j = 0; j < 16; j++)
      iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;

  for (int i = 0; i < 4; i++)
    for (int j = 0; j < 4; j++)
      iRefDct[i][j] = iDct[i * 4 + j] = (rand() & 65535) - 32768;
  WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
  WelsIDctRecI16x16Dc_c (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);

  // -1 while everything matches, otherwise i * 16 + j of the first mismatch.
  // Both loop conditions test it so the scan really stops at the first
  // mismatch; a plain `break` would leave only the inner loop and let a later
  // row overwrite the index.
  int iFirstMismatch = -1;
  for (int i = 0; i < 16 && iFirstMismatch < 0; i++) {
    for (int j = 0; j < 16 && iFirstMismatch < 0; j++) {
      if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j])
        iFirstMismatch = i * 16 + j;
    }
  }
  EXPECT_EQ (iFirstMismatch, -1) << "value is row * 16 + col of the first mismatching pixel";
}
346 #if defined(X86_ASM)
// x86 assembly variants of the four-block test; both use int16_t as the
// anchor's intermediate type.
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_sse2) {
  const PIDctFunc pfIdct = WelsIDctFourT4Rec_sse2;
  TestIDctFourT4Rec<int16_t> (pfIdct);
}
#if defined(HAVE_AVX2)
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_avx2) {
  // Nothing is checked when the CPU does not report AVX2.
  if (! (WelsCPUFeatureDetect (0) & WELS_CPU_AVX2))
    return;
  TestIDctFourT4Rec<int16_t> (WelsIDctFourT4Rec_avx2);
}
#endif
// Compares WelsIDctRecI16x16Dc_sse2 with WelsIDctRecI16x16DcAnchor on a random
// 16x16 prediction and 16 random DC values. Nothing is checked when the CPU
// does not report SSE2.
TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_sse2) {
  int32_t iCpuCores = 0;
  uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);

  if (uiCpuFeatureFlag & WELS_CPU_SSE2) {
    uint8_t iRefDst[16 * FDEC_STRIDE];
    int16_t iRefDct[4][4];
    ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
    ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
    ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
    for (int i = 0; i < 16; i++)
      for (int j = 0; j < 16; j++)
        iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;
    // DC values are drawn from [-2^14, 2^14 - 1]:
    // 2^14 limit, (2^15+32) will cause overflow for SSE2.
    for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++)
        iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 << 14);
    WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
    WelsIDctRecI16x16Dc_sse2 (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);

    // -1 while everything matches, otherwise i * 16 + j of the first mismatch.
    // Both loop conditions test it so the scan really stops at the first
    // mismatch; a plain `break` would leave only the inner loop and let a
    // later row overwrite the index.
    int iFirstMismatch = -1;
    for (int i = 0; i < 16 && iFirstMismatch < 0; i++) {
      for (int j = 0; j < 16 && iFirstMismatch < 0; j++) {
        if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j])
          iFirstMismatch = i * 16 + j;
      }
    }
    EXPECT_EQ (iFirstMismatch, -1) << "value is row * 16 + col of the first mismatching pixel";
  }
}
387 #endif
388 #if defined(HAVE_MMI)
// MMI variant of the four-block test, with int16_t as the anchor's
// intermediate type.
TEST (DecodeMbAuxTest, WelsIDctFourT4Rec_mmi) {
  const PIDctFunc pfIdct = WelsIDctFourT4Rec_mmi;
  TestIDctFourT4Rec<int16_t> (pfIdct);
}
// Compares WelsIDctRecI16x16Dc_mmi with WelsIDctRecI16x16DcAnchor on a random
// 16x16 prediction and 16 random DC values. Nothing is checked when the CPU
// does not report MMI.
TEST (DecodeMbAuxTest, WelsIDctRecI16x16Dc_mmi) {
  int32_t iCpuCores = 0;
  uint32_t uiCpuFeatureFlag = WelsCPUFeatureDetect (&iCpuCores);

  if (uiCpuFeatureFlag & WELS_CPU_MMI) {
    uint8_t iRefDst[16 * FDEC_STRIDE];
    int16_t iRefDct[4][4];
    ENFORCE_STACK_ALIGN_1D (int16_t, iDct, 16, 16);
    ENFORCE_STACK_ALIGN_1D (uint8_t, iPred, 16 * FDEC_STRIDE, 16);
    ENFORCE_STACK_ALIGN_1D (uint8_t, iRec, 16 * FDEC_STRIDE, 16);
    for (int i = 0; i < 16; i++)
      for (int j = 0; j < 16; j++)
        iRefDst[i * FDEC_STRIDE + j] = iPred[i * FDEC_STRIDE + j] = rand() & 255;
    // DC values are drawn from [-2^14, 2^14 - 1], the same reduced range the
    // SSE2 test uses to keep (value + 32) inside 16 bits.
    // NOTE(review): the limit was documented for SSE2 only; presumably the MMI
    // kernel has the same 16-bit restriction -- confirm.
    for (int i = 0; i < 4; i++)
      for (int j = 0; j < 4; j++)
        iRefDct[i][j] = iDct[i * 4 + j] = (rand() & ((1 << 15) - 1)) - (1 << 14);
    WelsIDctRecI16x16DcAnchor (iRefDst, iRefDct);
    WelsIDctRecI16x16Dc_mmi (iRec, FDEC_STRIDE, iPred, FDEC_STRIDE, iDct);

    // -1 while everything matches, otherwise i * 16 + j of the first mismatch.
    // Both loop conditions test it so the scan really stops at the first
    // mismatch; a plain `break` would leave only the inner loop and let a
    // later row overwrite the index.
    int iFirstMismatch = -1;
    for (int i = 0; i < 16 && iFirstMismatch < 0; i++) {
      for (int j = 0; j < 16 && iFirstMismatch < 0; j++) {
        if (iRec[i * FDEC_STRIDE + j] != iRefDst[i * FDEC_STRIDE + j])
          iFirstMismatch = i * 16 + j;
      }
    }
    EXPECT_EQ (iFirstMismatch, -1) << "value is row * 16 + col of the first mismatching pixel";
  }
}
423 #endif
424