1 /*!
2  * \copy
3  *     Copyright (c)  2013, Cisco Systems
4  *     All rights reserved.
5  *
6  *     Redistribution and use in source and binary forms, with or without
7  *     modification, are permitted provided that the following conditions
8  *     are met:
9  *
10  *        * Redistributions of source code must retain the above copyright
11  *          notice, this list of conditions and the following disclaimer.
12  *
13  *        * Redistributions in binary form must reproduce the above copyright
14  *          notice, this list of conditions and the following disclaimer in
15  *          the documentation and/or other materials provided with the
16  *          distribution.
17  *
18  *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25  *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26  *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
28  *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  *     POSSIBILITY OF SUCH DAMAGE.
30  *
31  */
32 
33 #include "decode_mb_aux.h"
34 #include "cpu_core.h"
35 
36 namespace WelsEnc {
/****************************************************************************
 * Dequantization and inverse Hadamard transform functions
 ****************************************************************************/
/*
 * In-place 4x4 Hadamard-style butterfly over the 16 coefficients in pRes (row-major 4x4).
 * A 4-point sum/difference butterfly is applied to every row first and then to every column.
 * Intermediate results are kept in int16_t, so they wrap exactly like the stored coefficients.
 */
void WelsIHadamard4x4Dc (int16_t* pRes) {
  // Horizontal pass: each row only touches its own four entries.
  for (int32_t iRow = 0; iRow < 4; ++iRow) {
    int16_t* pRow = pRes + (iRow << 2);

    const int16_t kiSum02  = pRow[0] + pRow[2];
    const int16_t kiDiff02 = pRow[0] - pRow[2];
    const int16_t kiDiff13 = pRow[1] - pRow[3];
    const int16_t kiSum13  = pRow[1] + pRow[3];

    pRow[0] = kiSum02  + kiSum13;
    pRow[1] = kiDiff02 + kiDiff13;
    pRow[2] = kiDiff02 - kiDiff13;
    pRow[3] = kiSum02  - kiSum13;
  }

  // Vertical pass: same butterfly, stepping through a column with a stride of 4.
  for (int32_t iCol = 0; iCol < 4; ++iCol) {
    int16_t* pCol = pRes + iCol;

    const int16_t kiSum02  = pCol[0] + pCol[8];
    const int16_t kiDiff02 = pCol[0] - pCol[8];
    const int16_t kiDiff13 = pCol[4] - pCol[12];
    const int16_t kiSum13  = pCol[4] + pCol[12];

    pCol[0]  = kiSum02  + kiSum13;
    pCol[4]  = kiDiff02 + kiDiff13;
    pCol[8]  = kiDiff02 - kiDiff13;
    pCol[12] = kiSum02  - kiSum13;
  }
}
78 
79 /* for qp < 12 */
WelsDequantLumaDc4x4(int16_t * pRes,const int32_t kiQp)80 void WelsDequantLumaDc4x4 (int16_t* pRes, const int32_t kiQp) {
81   int32_t i = 15;
82   const uint16_t kuiDequantValue = g_kuiDequantCoeff[kiQp % 6][0];
83   const int16_t kiQF0   = kiQp / 6;
84   const int16_t kiQF1   = 2 - kiQF0;
85   const int16_t kiQF0S  = 1 << (1 - kiQF0);
86 
87   while (i >= 0) {
88     pRes[i  ]   = (pRes[i  ]   * kuiDequantValue + kiQF0S) >> kiQF1;
89     pRes[i - 1] = (pRes[i - 1] * kuiDequantValue + kiQF0S) >> kiQF1;
90     pRes[i - 2] = (pRes[i - 2] * kuiDequantValue + kiQF0S) >> kiQF1;
91     pRes[i - 3] = (pRes[i - 3] * kuiDequantValue + kiQF0S) >> kiQF1;
92 
93     i -= 4;
94   }
95 }
96 
/*
 * In-place 4x4 Hadamard-style butterfly followed by scaling; intended for qp >= 12.
 * Rows are transformed first; the column pass then multiplies every output by kuiMF
 * before it is stored back into pRes (row-major 4x4, 16 entries).
 */
void WelsDequantIHadamard4x4_c (int16_t* pRes, const uint16_t kuiMF) {
  // Horizontal pass over the four rows.
  for (int16_t* pRow = pRes; pRow != pRes + 16; pRow += 4) {
    const int16_t kiSum02  = pRow[0] + pRow[2];
    const int16_t kiDiff02 = pRow[0] - pRow[2];
    const int16_t kiDiff13 = pRow[1] - pRow[3];
    const int16_t kiSum13  = pRow[1] + pRow[3];

    pRow[0] = kiSum02  + kiSum13;
    pRow[1] = kiDiff02 + kiDiff13;
    pRow[2] = kiDiff02 - kiDiff13;
    pRow[3] = kiSum02  - kiSum13;
  }

  // Vertical pass; the scale factor is applied while writing the final values.
  for (int32_t iCol = 0; iCol < 4; ++iCol) {
    int16_t* pCol = pRes + iCol;

    const int16_t kiSum02  = pCol[0] + pCol[8];
    const int16_t kiDiff02 = pCol[0] - pCol[8];
    const int16_t kiDiff13 = pCol[4] - pCol[12];
    const int16_t kiSum13  = pCol[4] + pCol[12];

    pCol[0]  = (kiSum02  + kiSum13)  * kuiMF;
    pCol[4]  = (kiDiff02 + kiDiff13) * kuiMF;
    pCol[8]  = (kiDiff02 - kiDiff13) * kuiMF;
    pCol[12] = (kiSum02  - kiSum13)  * kuiMF;
  }
}
126 
/*
 * In-place 2x2 sum/difference butterfly on the four values in pDct, combined with scaling:
 * each output is (combination * kuiMF) >> 1.
 */
void WelsDequantIHadamard2x2Dc (int16_t* pDct, const uint16_t kuiMF) {
  const int16_t kiC0 = pDct[0];
  const int16_t kiC1 = pDct[1];
  const int16_t kiC2 = pDct[2];
  const int16_t kiC3 = pDct[3];

  // First stage: pair entries 0/2 and 1/3 (results are truncated to int16_t).
  const int16_t kiSum02  = kiC0 + kiC2;
  const int16_t kiDiff02 = kiC0 - kiC2;
  const int16_t kiSum13  = kiC1 + kiC3;
  const int16_t kiDiff13 = kiC1 - kiC3;

  // Second stage: combine the pairs, scale and halve.
  pDct[0] = ((kiSum02  + kiSum13)  * kuiMF) >> 1;
  pDct[1] = ((kiSum02  - kiSum13)  * kuiMF) >> 1;
  pDct[2] = ((kiDiff02 + kiDiff13) * kuiMF) >> 1;
  pDct[3] = ((kiDiff02 - kiDiff13) * kuiMF) >> 1;
}
138 
/*
 * Scale the 16 coefficients of one 4x4 block in place.
 * kpMF supplies 8 factors; factor i is applied to pRes[i] and pRes[i + 8].
 */
void WelsDequant4x4_c (int16_t* pRes, const uint16_t* kpMF) {
  for (int32_t iIdx = 0; iIdx < 8; ++iIdx) {
    // The same factor serves both 8-entry halves of the block.
    for (int32_t iHalf = 0; iHalf < 16; iHalf += 8) {
      pRes[iIdx + iHalf] *= kpMF[iIdx];
    }
  }
}
146 
/*
 * Scale 64 coefficients (four consecutive 4x4 blocks) in place.
 * kpMF supplies 8 factors; factor i is applied to pRes[i], pRes[i + 8], ..., pRes[i + 56].
 */
void WelsDequantFour4x4_c (int16_t* pRes, const uint16_t* kpMF) {
  for (int32_t iIdx = 0; iIdx < 8; ++iIdx) {
    // Walk the eight 8-entry groups that share this factor.
    for (int32_t iGroup = 0; iGroup < 64; iGroup += 8) {
      pRes[iIdx + iGroup] *= kpMF[iIdx];
    }
  }
}
160 
161 /****************************************************************************
162  * IDCT functions, final output = prediction(CS) + IDCT(scaled_coeff)
163  ****************************************************************************/
WelsIDctT4Rec_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct)164 void WelsIDctT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
165   int32_t i;
166   int16_t iTemp[16];
167 
168   int32_t iDstStridex2 = iStride << 1;
169   int32_t iDstStridex3 = iStride + iDstStridex2;
170   int32_t iPredStridex2 = iPredStride << 1;
171   int32_t iPredStridex3 = iPredStride + iPredStridex2;
172 
173   for (i = 0; i < 4; i ++) { //horizon
174     int32_t iIdx = i << 2;
175     const int32_t kiHorSumU = pDct[iIdx] + pDct[iIdx + 2];      // add 0-2
176     const int32_t kiHorDelU = pDct[iIdx] - pDct[iIdx + 2];      // sub 0-2
177     const int32_t kiHorSumD = pDct[iIdx + 1] + (pDct[iIdx + 3] >> 1);
178     const int32_t kiHorDelD = (pDct[iIdx + 1] >> 1) - pDct[iIdx + 3];
179 
180     iTemp[iIdx  ]   = kiHorSumU  + kiHorSumD;
181     iTemp[iIdx + 1] = kiHorDelU   + kiHorDelD;
182     iTemp[iIdx + 2] = kiHorDelU   -  kiHorDelD;
183     iTemp[iIdx + 3] = kiHorSumU  -  kiHorSumD;
184   }
185 
186   for (i = 0; i < 4; i ++) { //vertical
187     const int32_t kiVerSumL = iTemp[i]                 + iTemp[8 + i];
188     const int32_t kiVerDelL   = iTemp[i]                 - iTemp[8 + i];
189     const int32_t kiVerDelR   = (iTemp[4 + i] >> 1) - iTemp[12 + i];
190     const int32_t kiVerSumR = iTemp[4 + i]             + (iTemp[12 + i] >> 1);
191 
192     pRec[i               ] = WelsClip1 (pPred[i                ] + ((kiVerSumL + kiVerSumR + 32) >> 6));
193     pRec[iStride + i     ] = WelsClip1 (pPred[iPredStride + i  ] + ((kiVerDelL + kiVerDelR + 32) >> 6));
194     pRec[iDstStridex2 + i] = WelsClip1 (pPred[iPredStridex2 + i] + ((kiVerDelL - kiVerDelR + 32) >> 6));
195     pRec[iDstStridex3 + i] = WelsClip1 (pPred[iPredStridex3 + i] + ((kiVerSumL - kiVerSumR + 32) >> 6));
196   }
197 }
198 
WelsIDctFourT4Rec_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct)199 void WelsIDctFourT4Rec_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct) {
200   int32_t iDstStridex4  = iStride << 2;
201   int32_t iPredStridex4 = iPredStride << 2;
202   WelsIDctT4Rec_c (pRec,                    iStride, pPred,                     iPredStride, pDct);
203   WelsIDctT4Rec_c (&pRec[4],                iStride, &pPred[4],                 iPredStride, pDct + 16);
204   WelsIDctT4Rec_c (&pRec[iDstStridex4    ], iStride, &pPred[iPredStridex4  ],   iPredStride, pDct + 32);
205   WelsIDctT4Rec_c (&pRec[iDstStridex4 + 4], iStride, &pPred[iPredStridex4 + 4], iPredStride, pDct + 48);
206 
207 }
208 
WelsIDctT4RecOnMb(uint8_t * pDst,int32_t iDstStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDct,PIDctFunc pfIDctFourT4)209 void WelsIDctT4RecOnMb (uint8_t* pDst, int32_t iDstStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDct,
210                         PIDctFunc pfIDctFourT4) {
211   int32_t iDstStridex8  = iDstStride << 3;
212   int32_t iPredStridex8 = iPredStride << 3;
213 
214   pfIDctFourT4 (&pDst[0], iDstStride, &pPred[0], iPredStride, pDct);
215   pfIDctFourT4 (&pDst[8], iDstStride, &pPred[8], iPredStride, pDct + 64);
216   pfIDctFourT4 (&pDst[iDstStridex8], iDstStride, &pPred[iPredStridex8], iPredStride, pDct + 128);
217   pfIDctFourT4 (&pDst[iDstStridex8 + 8], iDstStride, &pPred[iPredStridex8 + 8], iPredStride, pDct + 192);
218 }
219 
220 /*
221  * pfIDctI16x16Dc: do luma idct of an MB for I16x16 mode, when only dc value are non-zero
222  */
WelsIDctRecI16x16Dc_c(uint8_t * pRec,int32_t iStride,uint8_t * pPred,int32_t iPredStride,int16_t * pDctDc)223 void WelsIDctRecI16x16Dc_c (uint8_t* pRec, int32_t iStride, uint8_t* pPred, int32_t iPredStride, int16_t* pDctDc) {
224   int32_t i, j;
225 
226   for (i = 0; i < 16; i ++) {
227     for (j = 0; j < 16; j++) {
228       pRec[j] = WelsClip1 (pPred[j] + ((pDctDc[ (i & 0x0C) + (j >> 2)] + 32) >> 6));
229     }
230     pRec += iStride;
231     pPred += iPredStride;
232   }
233 }
234 
/*
 * Fill pBlock (at least 24 entries) with per-block offsets.
 *   pBlock[0..15]  - luma: for 8x8 group j and sub-block s, entry 4 * j + s holds
 *                    (x + y * kiStrideY) << 2, with x and y counted in 4-sample units.
 *   pBlock[16..19] - chroma: ((j & 1) + (j & 2) * kiStrideUV) << 2 for j = 0..3.
 *   pBlock[20..23] - copy of entries 16..19.
 * NOTE(review): the chroma row term reuses the luma row value (0 or 2), so entries 18/19 and 22/23
 * are based on 8 * kiStrideUV rather than 4 * kiStrideUV - confirm this is what callers expect.
 */
void WelsGetEncBlockStrideOffset (int32_t* pBlock, const int32_t kiStrideY, const int32_t kiStrideUV) {
  for (int32_t iGroup = 0; iGroup < 4; ++iGroup) {
    const int32_t kiGroupX = (iGroup & 0x01) << 1; // 0 or 2
    const int32_t kiGroupY = iGroup & 0x02;        // 0 or 2
    int32_t* pLuma         = pBlock + (iGroup << 2);

    // Four 4x4 sub-blocks of this 8x8 group, in raster order.
    for (int32_t iSub = 0; iSub < 4; ++iSub) {
      const int32_t kiX = kiGroupX + (iSub & 0x01);
      const int32_t kiY = kiGroupY + (iSub >> 1);
      pLuma[iSub] = (kiX + kiY * kiStrideY) << 2;
    }

    // One chroma offset per group, stored twice (entries 16 + j and 20 + j).
    const int32_t kiChroma = ((iGroup & 0x01) + kiGroupY * kiStrideUV) << 2;
    pBlock[20 + iGroup] = kiChroma;
    pBlock[16 + iGroup] = kiChroma;
  }
}
250 
WelsInitReconstructionFuncs(SWelsFuncPtrList * pFuncList,uint32_t uiCpuFlag)251 void WelsInitReconstructionFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) {
252   pFuncList->pfDequantization4x4            = WelsDequant4x4_c;
253   pFuncList->pfDequantizationFour4x4        = WelsDequantFour4x4_c;
254   pFuncList->pfDequantizationIHadamard4x4   = WelsDequantIHadamard4x4_c;
255 
256   pFuncList->pfIDctT4           = WelsIDctT4Rec_c;
257   pFuncList->pfIDctFourT4       = WelsIDctFourT4Rec_c;
258   pFuncList->pfIDctI16x16Dc     = WelsIDctRecI16x16Dc_c;
259 
260 #if defined(X86_ASM)
261   if (uiCpuFlag & WELS_CPU_MMXEXT) {
262     pFuncList->pfIDctT4         = WelsIDctT4Rec_mmx;
263   }
264   if (uiCpuFlag & WELS_CPU_SSE2) {
265     pFuncList->pfDequantization4x4          = WelsDequant4x4_sse2;
266     pFuncList->pfDequantizationFour4x4      = WelsDequantFour4x4_sse2;
267     pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_sse2;
268 
269     pFuncList->pfIDctT4         = WelsIDctT4Rec_sse2;
270     pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_sse2;
271     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_sse2;
272   }
273 #if defined(HAVE_AVX2)
274   if (uiCpuFlag & WELS_CPU_AVX2) {
275     pFuncList->pfIDctT4     = WelsIDctT4Rec_avx2;
276     pFuncList->pfIDctFourT4 = WelsIDctFourT4Rec_avx2;
277   }
278 #endif
279 
280 #endif//X86_ASM
281 
282 #if defined(HAVE_NEON)
283   if (uiCpuFlag & WELS_CPU_NEON) {
284     pFuncList->pfDequantization4x4          = WelsDequant4x4_neon;
285     pFuncList->pfDequantizationFour4x4      = WelsDequantFour4x4_neon;
286     pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_neon;
287 
288     pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_neon;
289     pFuncList->pfIDctT4         = WelsIDctT4Rec_neon;
290     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_neon;
291   }
292 #endif
293 
294 #if defined(HAVE_NEON_AARCH64)
295   if (uiCpuFlag & WELS_CPU_NEON) {
296     pFuncList->pfDequantization4x4          = WelsDequant4x4_AArch64_neon;
297     pFuncList->pfDequantizationFour4x4      = WelsDequantFour4x4_AArch64_neon;
298     pFuncList->pfDequantizationIHadamard4x4 = WelsDequantIHadamard4x4_AArch64_neon;
299 
300     pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_AArch64_neon;
301     pFuncList->pfIDctT4         = WelsIDctT4Rec_AArch64_neon;
302     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_AArch64_neon;
303   }
304 #endif
305 
306 #if defined(HAVE_MMI)
307   if (uiCpuFlag & WELS_CPU_MMI) {
308     pFuncList->pfIDctT4         = WelsIDctT4Rec_mmi;
309     pFuncList->pfIDctFourT4     = WelsIDctFourT4Rec_mmi;
310     pFuncList->pfIDctI16x16Dc   = WelsIDctRecI16x16Dc_mmi;
311   }
312 #endif//HAVE_MMI
313 }
314 }
315