1 #include "deblocking_common.h"
2 #include "macros.h"
3 
4 //  C code only
DeblockLumaLt4_c(uint8_t * pPix,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)5 void DeblockLumaLt4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta,
6                        int8_t* pTc) {
7   for (int32_t i = 0; i < 16; i++) {
8     int32_t iTc0 = pTc[i >> 2];
9     if (iTc0 >= 0) {
10       int32_t p0 = pPix[-iStrideX];
11       int32_t p1 = pPix[-2 * iStrideX];
12       int32_t p2 = pPix[-3 * iStrideX];
13       int32_t q0 = pPix[0];
14       int32_t q1 = pPix[iStrideX];
15       int32_t q2 = pPix[2 * iStrideX];
16       bool bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
17       bool bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
18       bool bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
19       int32_t iTc = iTc0;
20       if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
21         bool bDetaP2P0 =  WELS_ABS (p2 - p0) < iBeta;
22         bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
23         if (bDetaP2P0) {
24           pPix[-2 * iStrideX] = p1 + WELS_CLIP3 ((p2 + ((p0 + q0 + 1) >> 1) - (p1 * (1 << 1))) >> 1, -iTc0, iTc0);
25           iTc++;
26         }
27         if (bDetaQ2Q0) {
28           pPix[iStrideX] = q1 + WELS_CLIP3 ((q2 + ((p0 + q0 + 1) >> 1) - (q1 * (1 << 1))) >> 1, -iTc0, iTc0);
29           iTc++;
30         }
31         int32_t iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc, iTc);
32         pPix[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
33         pPix[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
34       }
35     }
36     pPix += iStrideY;
37   }
38 }
DeblockLumaEq4_c(uint8_t * pPix,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)39 void DeblockLumaEq4_c (uint8_t* pPix, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha, int32_t iBeta) {
40   int32_t p0, p1, p2, q0, q1, q2;
41   int32_t iDetaP0Q0;
42   bool bDetaP1P0, bDetaQ1Q0;
43   for (int32_t i = 0; i < 16; i++) {
44     p0 = pPix[-iStrideX];
45     p1 = pPix[-2 * iStrideX];
46     p2 = pPix[-3 * iStrideX];
47     q0 = pPix[0];
48     q1 = pPix[iStrideX];
49     q2 = pPix[2 * iStrideX];
50     iDetaP0Q0 = WELS_ABS (p0 - q0);
51     bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
52     bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
53     if ((iDetaP0Q0 < iAlpha) && bDetaP1P0 && bDetaQ1Q0) {
54       if (iDetaP0Q0 < ((iAlpha >> 2) + 2)) {
55         bool bDetaP2P0 = WELS_ABS (p2 - p0) < iBeta;
56         bool bDetaQ2Q0 =  WELS_ABS (q2 - q0) < iBeta;
57         if (bDetaP2P0) {
58           const int32_t p3 = pPix[-4 * iStrideX];
59           pPix[-iStrideX] = (p2 + (p1 * (1 << 1)) + (p0 * (1 << 1)) + (q0 * (1 << 1)) + q1 + 4) >> 3;   //p0
60           pPix[-2 * iStrideX] = (p2 + p1 + p0 + q0 + 2) >> 2;                         //p1
61           pPix[-3 * iStrideX] = ((p3 * (1 << 1)) + p2 + (p2 * (1 << 1)) + p1 + p0 + q0 + 4) >> 3; //p2
62         } else {
63           pPix[-1 * iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;                       //p0
64         }
65         if (bDetaQ2Q0) {
66           const int32_t q3 = pPix[3 * iStrideX];
67           pPix[0] = (p1 + (p0 * (1 << 1)) + (q0 * (1 << 1)) + (q1 * (1 << 1)) + q2 + 4) >> 3;           //q0
68           pPix[iStrideX] = (p0 + q0 + q1 + q2 + 2) >> 2;                              //q1
69           pPix[2 * iStrideX] = ((q3 * (1 << 1)) + q2 + (q2 * (1 << 1)) + q1 + q0 + p0 + 4) >> 3;  //q2
70         } else {
71           pPix[0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;                                   //q0
72         }
73       } else {
74         pPix[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;   //p0
75         pPix[ 0] = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;          //q0
76       }
77     }
78     pPix += iStrideY;
79   }
80 }
DeblockLumaLt4V_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)81 void DeblockLumaLt4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
82   DeblockLumaLt4_c (pPix, iStride, 1, iAlpha, iBeta, tc);
83 }
DeblockLumaLt4H_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)84 void DeblockLumaLt4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* tc) {
85   DeblockLumaLt4_c (pPix, 1, iStride, iAlpha, iBeta, tc);
86 }
DeblockLumaEq4V_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)87 void DeblockLumaEq4V_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
88   DeblockLumaEq4_c (pPix, iStride, 1, iAlpha, iBeta);
89 }
DeblockLumaEq4H_c(uint8_t * pPix,int32_t iStride,int32_t iAlpha,int32_t iBeta)90 void DeblockLumaEq4H_c (uint8_t* pPix, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
91   DeblockLumaEq4_c (pPix, 1, iStride, iAlpha, iBeta);
92 }
DeblockChromaLt4_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)93 void DeblockChromaLt4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
94                          int32_t iBeta, int8_t* pTc) {
95   int32_t p0, p1, q0, q1, iDeta;
96   bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
97 
98   for (int32_t i = 0; i < 8; i++) {
99     int32_t iTc0 = pTc[i >> 1];
100     if (iTc0 > 0) {
101       p0 = pPixCb[-iStrideX];
102       p1 = pPixCb[-2 * iStrideX];
103       q0 = pPixCb[0];
104       q1 = pPixCb[iStrideX];
105 
106       bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
107       bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
108       bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
109       if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
110         iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
111         pPixCb[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
112         pPixCb[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
113       }
114 
115 
116       p0 = pPixCr[-iStrideX];
117       p1 = pPixCr[-2 * iStrideX];
118       q0 = pPixCr[0];
119       q1 = pPixCr[iStrideX];
120 
121       bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
122       bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
123       bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
124 
125       if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
126         iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
127         pPixCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
128         pPixCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
129       }
130     }
131     pPixCb += iStrideY;
132     pPixCr += iStrideY;
133   }
134 }
DeblockChromaEq4_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)135 void DeblockChromaEq4_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
136                          int32_t iBeta) {
137   int32_t p0, p1, q0, q1;
138   bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
139   for (int32_t i = 0; i < 8; i++) {
140     //cb
141     p0 = pPixCb[-iStrideX];
142     p1 = pPixCb[-2 * iStrideX];
143     q0 = pPixCb[0];
144     q1 = pPixCb[iStrideX];
145     bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
146     bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
147     bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
148     if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
149       pPixCb[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
150       pPixCb[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
151     }
152 
153     //cr
154     p0 = pPixCr[-iStrideX];
155     p1 = pPixCr[-2 * iStrideX];
156     q0 = pPixCr[0];
157     q1 = pPixCr[iStrideX];
158     bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
159     bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
160     bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
161     if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
162       pPixCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
163       pPixCr[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
164     }
165     pPixCr += iStrideY;
166     pPixCb += iStrideY;
167   }
168 }
DeblockChromaLt4V_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)169 void DeblockChromaLt4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
170                           int8_t* tc) {
171   DeblockChromaLt4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta, tc);
172 }
DeblockChromaLt4H_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)173 void DeblockChromaLt4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
174                           int8_t* tc) {
175   DeblockChromaLt4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta, tc);
176 }
DeblockChromaEq4V_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)177 void DeblockChromaEq4V_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
178   DeblockChromaEq4_c (pPixCb, pPixCr, iStride, 1, iAlpha, iBeta);
179 }
DeblockChromaEq4H_c(uint8_t * pPixCb,uint8_t * pPixCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)180 void DeblockChromaEq4H_c (uint8_t* pPixCb, uint8_t* pPixCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
181   DeblockChromaEq4_c (pPixCb, pPixCr, 1, iStride, iAlpha, iBeta);
182 }
183 
DeblockChromaLt42_c(uint8_t * pPixCbCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta,int8_t * pTc)184 void DeblockChromaLt42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
185                           int32_t iBeta, int8_t* pTc) {
186   int32_t p0, p1, q0, q1, iDeta;
187   bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
188 
189   for (int32_t i = 0; i < 8; i++) {
190     int32_t iTc0 = pTc[i >> 1];
191     if (iTc0 > 0) {
192       p0 = pPixCbCr[-iStrideX];
193       p1 = pPixCbCr[-2 * iStrideX];
194       q0 = pPixCbCr[0];
195       q1 = pPixCbCr[iStrideX];
196 
197       bDetaP0Q0 =  WELS_ABS (p0 - q0) < iAlpha;
198       bDetaP1P0 =  WELS_ABS (p1 - p0) < iBeta;
199       bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
200       if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
201         iDeta = WELS_CLIP3 ((((q0 - p0) * (1 << 2)) + (p1 - q1) + 4) >> 3, -iTc0, iTc0);
202         pPixCbCr[-iStrideX] = WelsClip1 (p0 + iDeta);     /* p0' */
203         pPixCbCr[0]  = WelsClip1 (q0 - iDeta);     /* q0' */
204       }
205 
206 
207     }
208     pPixCbCr += iStrideY;
209   }
210 }
DeblockChromaEq42_c(uint8_t * pPixCbCr,int32_t iStrideX,int32_t iStrideY,int32_t iAlpha,int32_t iBeta)211 void DeblockChromaEq42_c (uint8_t* pPixCbCr, int32_t iStrideX, int32_t iStrideY, int32_t iAlpha,
212                           int32_t iBeta) {
213   int32_t p0, p1, q0, q1;
214   bool bDetaP0Q0, bDetaP1P0, bDetaQ1Q0;
215   for (int32_t i = 0; i < 8; i++) {
216     p0 = pPixCbCr[-iStrideX];
217     p1 = pPixCbCr[-2 * iStrideX];
218     q0 = pPixCbCr[0];
219     q1 = pPixCbCr[iStrideX];
220     bDetaP0Q0 = WELS_ABS (p0 - q0) < iAlpha;
221     bDetaP1P0 = WELS_ABS (p1 - p0) < iBeta;
222     bDetaQ1Q0 = WELS_ABS (q1 - q0) < iBeta;
223     if (bDetaP0Q0 && bDetaP1P0 && bDetaQ1Q0) {
224       pPixCbCr[-iStrideX] = ((p1 * (1 << 1)) + p0 + q1 + 2) >> 2;     /* p0' */
225       pPixCbCr[0]  = ((q1 * (1 << 1)) + q0 + p1 + 2) >> 2;     /* q0' */
226     }
227 
228     pPixCbCr += iStrideY;
229   }
230 }
231 
DeblockChromaLt4V2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)232 void DeblockChromaLt4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
233                            int8_t* tc) {
234   DeblockChromaLt42_c (pPixCbCr, iStride, 1, iAlpha, iBeta, tc);
235 }
DeblockChromaLt4H2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * tc)236 void DeblockChromaLt4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta,
237                            int8_t* tc) {
238 
239   DeblockChromaLt42_c (pPixCbCr, 1, iStride, iAlpha, iBeta, tc);
240 }
DeblockChromaEq4V2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)241 void DeblockChromaEq4V2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
242   DeblockChromaEq42_c (pPixCbCr, iStride, 1, iAlpha, iBeta);
243 }
DeblockChromaEq4H2_c(uint8_t * pPixCbCr,int32_t iStride,int32_t iAlpha,int32_t iBeta)244 void DeblockChromaEq4H2_c (uint8_t* pPixCbCr, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
245   DeblockChromaEq42_c (pPixCbCr, 1, iStride, iAlpha, iBeta);
246 }
247 
// Collapses the first 24 entries of pNonZeroCount to flags in place:
// every non-zero entry becomes 1, every zero entry stays 0.
void WelsNonZeroCount_c (int8_t* pNonZeroCount) {
  int8_t* const pEnd = pNonZeroCount + 24;
  for (int8_t* pCur = pNonZeroCount; pCur != pEnd; ++pCur) {
    *pCur = (*pCur != 0) ? 1 : 0;
  }
}
254 
255 #ifdef X86_ASM
256 extern "C" {
DeblockLumaLt4H_ssse3(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTc)257   void DeblockLumaLt4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
258     ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
259 
260     DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
261     DeblockLumaLt4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
262     DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
263   }
264 
DeblockLumaEq4H_ssse3(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta)265   void DeblockLumaEq4H_ssse3 (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
266     ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
267 
268     DeblockLumaTransposeH2V_sse2 (pPixY - 4, iStride, &uiBuf[0]);
269     DeblockLumaEq4V_ssse3 (&uiBuf[4 * 16], 16, iAlpha, iBeta);
270     DeblockLumaTransposeV2H_sse2 (pPixY - 4, iStride, &uiBuf[0]);
271   }
272 
273 }
274 
275 #endif
276 
277 #ifdef HAVE_MMI
278 extern "C" {
DeblockLumaLt4H_mmi(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta,int8_t * pTc)279   void DeblockLumaLt4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta, int8_t* pTc) {
280     ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
281 
282     DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
283     DeblockLumaLt4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta, pTc);
284     DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
285   }
286 
DeblockLumaEq4H_mmi(uint8_t * pPixY,int32_t iStride,int32_t iAlpha,int32_t iBeta)287   void DeblockLumaEq4H_mmi (uint8_t* pPixY, int32_t iStride, int32_t iAlpha, int32_t iBeta) {
288     ENFORCE_STACK_ALIGN_1D (uint8_t,  uiBuf,   16 * 8, 16);
289 
290     DeblockLumaTransposeH2V_mmi (pPixY - 4, iStride, &uiBuf[0]);
291     DeblockLumaEq4V_mmi (&uiBuf[4 * 16], 16, iAlpha, iBeta);
292     DeblockLumaTransposeV2H_mmi (pPixY - 4, iStride, &uiBuf[0]);
293   }
294 }
295 #endif//HAVE_MMI
296