/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
 *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Rajesh Paulraj <rajesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
 *          Nabajit Deka <nabajit@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "contexts.h"   // costCoeffNxN_c
#include "threading.h"  // CLZ

using namespace X265_NS;

#if _MSC_VER
#pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
#endif

// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
// give identical results
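// For reference, a sketch of the underlying math (the matrix below is the
// HEVC 4x4 DST-VII integer transform as specified by the standard, not
// something defined in this file):
//     { 29,  55,  74,  84 }
//     { 74,  74,   0, -74 }
//     { 84, -29, -74,  55 }
//     { 55, -84,  74, -29 }
// fastForwardDst() computes the same row products with fewer multiplications
// by reusing the shared sums c[0..3]; note that 29 + 55 == 84, which is what
// makes the factorization work.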
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift)  // input block, output coeff
{
    int c[4];
    int rnd_factor = 1 << (shift - 1);

    for (int i = 0; i < 4; i++)
    {
        // Intermediate Variables
        c[0] = block[4 * i + 0] + block[4 * i + 3];
        c[1] = block[4 * i + 1] + block[4 * i + 3];
        c[2] = block[4 * i + 0] - block[4 * i + 1];
        c[3] = 74 * block[4 * i + 2];

        coeff[i] =      (int16_t)((29 * c[0] + 55 * c[1]  + c[3] + rnd_factor) >> shift);
        coeff[4 + i] =  (int16_t)((74 * (block[4 * i + 0] + block[4 * i + 1] - block[4 * i + 3]) + rnd_factor) >> shift);
        coeff[8 + i] =  (int16_t)((29 * c[2] + 55 * c[0]  - c[3] + rnd_factor) >> shift);
        coeff[12 + i] = (int16_t)((55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
    }
}

static void inversedst(const int16_t* tmp, int16_t* block, int shift)  // input tmp, output block
{
    int i, c[4];
    int rnd_factor = 1 << (shift - 1);

    for (i = 0; i < 4; i++)
    {
        // Intermediate Variables
        c[0] = tmp[i] + tmp[8 + i];
        c[1] = tmp[8 + i] + tmp[12 + i];
        c[2] = tmp[i] - tmp[12 + i];
        c[3] = 74 * tmp[4 + i];

        block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor) >> shift);
        block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor) >> shift);
        block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i]  + tmp[12 + i])      + rnd_factor) >> shift);
        block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor) >> shift);
    }
}

static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 8; k++)
        {
            E[k] = src[k] + src[15 - k];
            O[k] = src[k] - src[15 - k];
        }

        /* EE and EO */
        for (k = 0; k < 4; k++)
        {
            EE[k] = E[k] + E[7 - k];
            EO[k] = E[k] - E[7 - k];
        }

        /* EEE and EEO */
        EEE[0] = EE[0] + EE[3];
        EEO[0] = EE[0] - EE[3];
        EEE[1] = EE[1] + EE[2];
        EEO[1] = EE[1] - EE[2];

        dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
        dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);

        for (k = 2; k < 16; k += 4)
        {
            dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
                                       g_t16[k][3] * EO[3] + add) >> shift);
        }

        for (k = 1; k < 16; k += 2)
        {
            dst[k * line] =  (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
                                        g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
                                        add) >> shift);
        }

        src += 16;
        dst++;
    }
}

static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 16; k++)
        {
            E[k] = src[k] + src[31 - k];
            O[k] = src[k] - src[31 - k];
        }

        /* EE and EO */
        for (k = 0; k < 8; k++)
        {
            EE[k] = E[k] + E[15 - k];
            EO[k] = E[k] - E[15 - k];
        }

        /* EEE and EEO */
        for (k = 0; k < 4; k++)
        {
            EEE[k] = EE[k] + EE[7 - k];
            EEO[k] = EE[k] - EE[7 - k];
        }

        /* EEEE and EEEO */
        EEEE[0] = EEE[0] + EEE[3];
        EEEO[0] = EEE[0] - EEE[3];
        EEEE[1] = EEE[1] + EEE[2];
        EEEO[1] = EEE[1] - EEE[2];

        dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
        dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
        dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
        dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
        for (k = 4; k < 32; k += 8)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
                                       g_t32[k][3] * EEO[3] + add) >> shift);
        }

        for (k = 2; k < 32; k += 4)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
                                       g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
                                       g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
        }

        for (k = 1; k < 32; k += 2)
        {
            dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
                                       g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
                                       g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
                                       O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
                                       g_t32[k][15] * O[15] + add) >> shift);
        }

        src += 32;
        dst++;
    }
}

static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        for (k = 0; k < 4; k++)
        {
            E[k] = src[k] + src[7 - k];
            O[k] = src[k] - src[7 - k];
        }

        /* EE and EO */
        EE[0] = E[0] + E[3];
        EO[0] = E[0] - E[3];
        EE[1] = E[1] + E[2];
        EO[1] = E[1] - E[2];

        dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
        dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
        dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);

        dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
        dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
        dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);

        src += 8;
        dst++;
    }
}

static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
        O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
        E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
        E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];

        /* Combining even and odd terms at each hierarchy level to calculate the final spatial domain vector */
        dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
        dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
        dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
        dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));

        src++;
        dst += 4;
    }
}

static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[4], O[4];
    int EE[2], EO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 4; k++)
        {
            O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
        }

        EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
        EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
        EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
        EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];

        /* Combining even and odd terms at each hierarchy level to calculate the final spatial domain vector */
        E[0] = EE[0] + EO[0];
        E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];
        E[2] = EE[1] - EO[1];
        for (k = 0; k < 4; k++)
        {
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
        }

        src++;
        dst += 8;
    }
}

static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[8], O[8];
    int EE[4], EO[4];
    int EEE[2], EEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 8; k++)
        {
            O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
                g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
        }

        EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
        EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
        EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
        EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];

        /* Combining even and odd terms at each hierarchy level to calculate the final spatial domain vector */
        for (k = 0; k < 2; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 2] = EEE[1 - k] - EEO[1 - k];
        }

        for (k = 0; k < 4; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 4] = EE[3 - k] - EO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            dst[k]   = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
        }

        src++;
        dst += 16;
    }
}

static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j, k;
    int E[16], O[16];
    int EE[8], EO[8];
    int EEE[4], EEO[4];
    int EEEE[2], EEEO[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
        for (k = 0; k < 16; k++)
        {
            O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
                g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
                g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
                g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
        }

        for (k = 0; k < 8; k++)
        {
            EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
                g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
        }

        for (k = 0; k < 4; k++)
        {
            EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
        }

        EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
        EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
        EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
        EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];

        /* Combining even and odd terms at each hierarchy level to calculate the final spatial domain vector */
        EEE[0] = EEEE[0] + EEEO[0];
        EEE[3] = EEEE[0] - EEEO[0];
        EEE[1] = EEEE[1] + EEEO[1];
        EEE[2] = EEEE[1] - EEEO[1];
        for (k = 0; k < 4; k++)
        {
            EE[k] = EEE[k] + EEO[k];
            EE[k + 4] = EEE[3 - k] - EEO[3 - k];
        }

        for (k = 0; k < 8; k++)
        {
            E[k] = EE[k] + EO[k];
            E[k + 8] = EE[7 - k] - EO[7 - k];
        }

        for (k = 0; k < 16; k++)
        {
            dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
            dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
        }

        src++;
        dst += 32;
    }
}

static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
{
    int j;
    int E[2], O[2];
    int add = 1 << (shift - 1);

    for (j = 0; j < line; j++)
    {
        /* E and O */
        E[0] = src[0] + src[3];
        O[0] = src[0] - src[3];
        E[1] = src[1] + src[2];
        O[1] = src[1] - src[2];

        dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
        dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
        dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
        dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);

        src += 4;
        dst++;
    }
}

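// Each forward transform below is computed separably: a first butterfly pass
// over the rows of the input block, then a second pass over the intermediate
// coefficients. A sketch of the shift selection (following the HEVC design,
// consistent with the constants used below, where N = 1 << log2TrSize):
//     shift_1st = log2TrSize - 1 + X265_DEPTH - 8
//     shift_2nd = log2TrSize + 6
// which keeps all intermediate values within 16-bit dynamic range.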
static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
    }

    fastForwardDst(block, coef, shift_1st);
    fastForwardDst(coef, dst, shift_2nd);
}

static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 1 + X265_DEPTH - 8;
    const int shift_2nd = 8;

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    for (int i = 0; i < 4; i++)
    {
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
    }

    partialButterfly4(block, coef, shift_1st, 4);
    partialButterfly4(coef, dst, shift_2nd, 4);
}

static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 2 + X265_DEPTH - 8;
    const int shift_2nd = 9;

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
    }

    partialButterfly8(block, coef, shift_1st, 8);
    partialButterfly8(coef, dst, shift_2nd, 8);
}

static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 3 + X265_DEPTH - 8;
    const int shift_2nd = 10;

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
    }

    partialButterfly16(block, coef, shift_1st, 16);
    partialButterfly16(coef, dst, shift_2nd, 16);
}

static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
{
    const int shift_1st = 4 + X265_DEPTH - 8;
    const int shift_2nd = 11;

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
    }

    partialButterfly32(block, coef, shift_1st, 32);
    partialButterfly32(coef, dst, shift_2nd, 32);
}

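// The inverse transforms use size-independent shifts, again following the
// HEVC design: shift_1st = 7 and shift_2nd = 12 - (X265_DEPTH - 8), with the
// result of each pass clipped back to 16-bit range.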
static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    inversedst(src, coef, shift_1st);    // inverse DST by fast algorithm, src input, coef output
    inversedst(coef, block, shift_2nd);  // inverse DST by fast algorithm, coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[4 * 4]);
    ALIGN_VAR_32(int16_t, block[4 * 4]);

    partialButterflyInverse4(src, coef, shift_1st, 4);   // inverse DCT first pass, src input, coef output
    partialButterflyInverse4(coef, block, shift_2nd, 4); // inverse DCT second pass, coef input, block output

    for (int i = 0; i < 4; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
    }
}

static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[8 * 8]);
    ALIGN_VAR_32(int16_t, block[8 * 8]);

    partialButterflyInverse8(src, coef, shift_1st, 8);
    partialButterflyInverse8(coef, block, shift_2nd, 8);

    for (int i = 0; i < 8; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
    }
}

static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[16 * 16]);
    ALIGN_VAR_32(int16_t, block[16 * 16]);

    partialButterflyInverse16(src, coef, shift_1st, 16);
    partialButterflyInverse16(coef, block, shift_2nd, 16);

    for (int i = 0; i < 16; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
    }
}

static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
{
    const int shift_1st = 7;
    const int shift_2nd = 12 - (X265_DEPTH - 8);

    ALIGN_VAR_32(int16_t, coef[32 * 32]);
    ALIGN_VAR_32(int16_t, block[32 * 32]);

    partialButterflyInverse32(src, coef, shift_1st, 32);
    partialButterflyInverse32(coef, block, shift_2nd, 32);

    for (int i = 0; i < 32; i++)
    {
        memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
    }
}

static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
{
#if HIGH_BIT_DEPTH
    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
#else
    // NOTE: maximum of scale is (72 * 256)
    X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
#endif
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
    X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
    X265_CHECK(shift <= 10, "shift too large %d\n", shift);
    X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");

    int add, coeffQ;

    add = 1 << (shift - 1);

    for (int n = 0; n < num; n++)
    {
        coeffQ = (quantCoef[n] * scale + add) >> shift;
        coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
    }
}

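// Illustrative arithmetic for the normal dequant path (hypothetical values,
// not taken from any caller): with quantCoef[n] = 10, scale = 40, shift = 4,
//     add  = 1 << (4 - 1) = 8
//     coef = (10 * 40 + 8) >> 4 = 408 >> 4 = 25
// followed by a clip to [-32768, 32767].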
static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
{
    X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);

    int add, coeffQ;

    shift += 4;

    if (shift > per)
    {
        add = 1 << (shift - per - 1);

        for (int n = 0; n < num; n++)
        {
            coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
        }
    }
    else
    {
        for (int n = 0; n < num; n++)
        {
            coeffQ  = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
            coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
        }
    }
}

static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK(qBits >= 8, "qBits less than 8\n");
    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
    int qBits8 = qBits - 8;
    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign  = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
        if (level)
            ++numSig;
        level *= sign;
        qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
    }

    return numSig;
}

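// Note on deltaU: it records the quantization remainder, rescaled to qBits8
// precision. As far as the callers in this codebase go, it appears to feed the
// sign-bit-hiding logic in the quantizer, which uses the remainders to pick
// the cheapest coefficient to adjust; that is an observation about the
// callers, not something enforced here.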
static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
{
    X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
    X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "add must be less than 2^qBits\n");
    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");

    uint32_t numSig = 0;

    for (int blockpos = 0; blockpos < numCoeff; blockpos++)
    {
        int level = coef[blockpos];
        int sign  = (level < 0 ? -1 : 1);

        int tmplevel = abs(level) * quantCoeff[blockpos];
        level = ((tmplevel + add) >> qBits);
        if (level)
            ++numSig;
        level *= sign;

        // TODO: limiting the range to [-32767, 32767] would allow a faster output
        //       path, but nquant is a small fraction of rdoQuant time, so the old
        //       dynamic range is kept for compatibility
        qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
    }

    return numSig;
}
template<int trSize>
int count_nonzero_c(const int16_t* quantCoeff)
{
    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
    int count = 0;
    int numCoeff = trSize * trSize;
    for (int i = 0; i < numCoeff; i++)
    {
        count += quantCoeff[i] != 0;
    }

    return count;
}

template<int trSize>
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
    uint32_t numSig = 0;
    for (int k = 0; k < trSize; k++)
    {
        for (int j = 0; j < trSize; j++)
        {
            coeff[k * trSize + j] = residual[k * resiStride + j];
            numSig += (residual[k * resiStride + j] != 0);
        }
    }

    return numSig;
}

static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
    for (int i = 0; i < numCoeff; i++)
    {
        int level = dctCoef[i];
        int sign = level >> 31;
        level = (level + sign) ^ sign; // branchless abs(level)
        resSum[i] += level;
        level -= offset[i];
        dctCoef[i] = (int16_t)(level < 0 ? 0 : (level ^ sign) - sign); // clamp at zero, restore original sign
    }
}

static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
{
    memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
    memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
    memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));

    int scanPosLast = 0;
    do
    {
        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;

        const uint32_t posLast = scan[scanPosLast++];

        const int curCoeff = coeff[posLast];
        const uint32_t isNZCoeff = (curCoeff != 0);
        // get L1 sig map
        // NOTE: the optimized algorithm is hard to follow, so the reference code is kept below
        //uint32_t posy   = posLast >> log2TrSize;
        //uint32_t posx   = posLast - (posy << log2TrSize);
        //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
        //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
        //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
        numSig -= isNZCoeff;

        // TODO: optimize by instruction BTS
        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
    }
    while (numSig > 0);
    return scanPosLast - 1;
}

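// For each coefficient group (CG), scanPosLast_c packs three outputs:
//     coeffNum[cg]  - count of nonzero coefficients in the CG
//     coeffFlag[cg] - significance bitmask, left-shifted once per scan
//                     position, so earlier scan positions land in higher bits
//     coeffSign[cg] - sign bits of the nonzero coefficients, packed in the
//                     order they were encountered
// The return value is the scan position of the last significant coefficient.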
// NOTE: lastNZPosInCG and absSumSign have no defined value when an all-zero block is passed in
static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
{
    int n;

    for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
    {
        const uint32_t idx = scanTbl[n];
        const uint32_t idxY = idx / MLS_CG_SIZE;
        const uint32_t idxX = idx % MLS_CG_SIZE;
        if (dstCoeff[idxY * trSize + idxX])
            break;
    }

    X265_CHECK(n >= -1, "non-zero coeff scan failure!\n");

    uint32_t lastNZPosInCG = (uint32_t)n;

    for (n = 0; n < SCAN_SET_SIZE; n++)
    {
        const uint32_t idx = scanTbl[n];
        const uint32_t idxY = idx / MLS_CG_SIZE;
        const uint32_t idxX = idx % MLS_CG_SIZE;
        if (dstCoeff[idxY * trSize + idxX])
            break;
    }

    uint32_t firstNZPosInCG = (uint32_t)n;

    uint32_t absSumSign = 0;
    for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
    {
        const uint32_t idx = scanTbl[n];
        const uint32_t idxY = idx / MLS_CG_SIZE;
        const uint32_t idxX = idx % MLS_CG_SIZE;
        absSumSign += dstCoeff[idxY * trSize + idxX];
    }

    // NOTE: when the coeff block is all ZERO, lastNZPosInCG is undefined and firstNZPosInCG is 16
    return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
}

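// Return value layout of findPosFirstLast_c:
//     bit  31     - parity (LSB) of the sum of coefficients in the CG, which
//                   is the quantity sign data hiding keys off downstream
//     bits 15..8  - last nonzero position within the CG (scan order)
//     bits  7..0  - first nonzero position within the CG (scan order)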

static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
{
    ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
    uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
    uint32_t sum = 0;

    // correct offset to match assembly
    absCoeff -= numNonZero;

    for (int i = 0; i < MLS_CG_SIZE; i++)
    {
        tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
        tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
        tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
        tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
    }

    do
    {
        uint32_t blkPos, sig, ctxSig;
        blkPos = scan[scanPosSigOff];
        const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
        sig     = scanFlagMask & 1;
        scanFlagMask >>= 1;
        X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "significance flag mistake\n");
        if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
        {
            const uint32_t cnt = tabSigCtx[blkPos] + offset;
            ctxSig = cnt & posZeroMask;

            //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");
            //encodeBin(sig, baseCtx[ctxSig]);
            const uint32_t mstate = baseCtx[ctxSig];
            const uint32_t mps = mstate & 1;
            const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
            uint32_t nextState = (stateBits >> 24) + mps;
            if ((mstate ^ sig) == 1)
                nextState = sig;
            X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
            X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
            baseCtx[ctxSig] = (uint8_t)nextState;
            sum += stateBits;
        }
        assert(numNonZero <= 15);
        assert(blkPos <= 15);
        absCoeff[numNonZero] = tmpCoeff[blkPos];
        numNonZero += sig;
        scanPosSigOff--;
    }
    while (scanPosSigOff >= 0);

    return (sum & 0xFFFFFF);
}

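// The PFX(entropyStateBits) table, as the X265_CHECKs above confirm, packs
// two fields per (contextState ^ bin) entry:
//     bits 23..0  - fractional cost in bits of coding the bin in that state
//     bits 31..24 - next-state increment (added to the MPS bit)
// hence `sum & 0xFFFFFF` extracts only the accumulated bit cost.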
static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
{
    uint32_t goRiceParam = 0;

    uint32_t sum = 0;
    int baseLevel = 3;
    do
    {
        if (idx >= C1FLAG_NUMBER)
            baseLevel = 1;

        // TODO: idx here is not the true scan index, so this check is disabled
        //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failure\n");
        int codeNumber = absCoeff[idx] - baseLevel;

        if (codeNumber >= 0)
        {
            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
            uint32_t length = 0;

            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
            if (codeNumber >= 0)
            {
                {
                    unsigned long cidx;
                    CLZ(cidx, codeNumber + 1);
                    length = cidx;
                }
                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");

                codeNumber = (length + length);
            }
            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);

            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
        }
        baseLevel = 2;
        idx++;
    }
    while (idx < numNonZero);

    return sum;
}

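// A sketch of the bit-cost model above (coeff_abs_level_remaining coding):
// small remainders use a truncated Rice code, costing prefix + 1 + goRiceParam
// suffix bins; once the Rice-shifted codeNumber reaches
// COEF_REMAIN_BIN_REDUCTION, coding switches to an Exp-Golomb suffix whose
// extra length is derived here with CLZ. goRiceParam adapts upward as larger
// levels are seen and saturates at 4.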

static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
{
    uint32_t sum = 0;
    uint32_t c1 = 1;
    uint32_t firstC2Idx = 8;
    uint32_t firstC2Flag = 2;
    uint32_t c1Next = 0xFFFFFFFE;

    int idx = 0;
    do
    {
        uint32_t symbol1 = absCoeff[idx] > 1;
        uint32_t symbol2 = absCoeff[idx] > 2;
        //encodeBin(symbol1, baseCtxMod[c1]);
        {
            const uint32_t mstate = baseCtxMod[c1];
            baseCtxMod[c1] = sbacNext(mstate, symbol1);
            sum += sbacGetEntropyBits(mstate, symbol1);
        }

        if (symbol1)
            c1Next = 0;

        if (symbol1 + firstC2Flag == 3)
            firstC2Flag = symbol2;

        if (symbol1 + firstC2Idx == 9)
            firstC2Idx  = idx;

        c1 = (c1Next & 3);
        c1Next >>= 2;
        X265_CHECK(c1 <= 3, "c1 check failure\n");
        idx++;
    }
    while (idx < numC1Flag);

    if (!c1)
    {
        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");

        baseCtxMod += ctxOffset;

        //encodeBin(firstC2Flag, baseCtxMod[0]);
        {
            const uint32_t mstate = baseCtxMod[0];
            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
            sum += sbacGetEntropyBits(mstate, firstC2Flag);
        }
    }
    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
}
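// Return value layout of costC1C2Flag_c:
//     bits 23..0  - accumulated entropy bit cost
//     bits 27..26 - final c1 context value
//     bits 31..28 - index of the first coefficient whose C2 flag was coded
// (c1 <= 3 and firstC2Idx <= 8, so the packed fields cannot collide.)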
template<int log2TrSize>
static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
{
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
    const int scaleBits = SCALE_BITS - 2 * transformShift;
    const uint32_t trSize = 1 << log2TrSize;

    for (int y = 0; y < MLS_CG_SIZE; y++)
    {
        for (int x = 0; x < MLS_CG_SIZE; x++)
        {
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
            *totalUncodedCost += costUncoded[blkPos + x];
            *totalRdCost += costUncoded[blkPos + x];
        }
        blkPos += trSize;
    }
}
template<int log2TrSize>
static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
{
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
    const int scaleBits = SCALE_BITS - 2 * transformShift;
    const uint32_t trSize = 1 << log2TrSize;
    int max = X265_MAX(0, (2 * transformShift + 1));

    for (int y = 0; y < MLS_CG_SIZE; y++)
    {
        for (int x = 0; x < MLS_CG_SIZE; x++)
        {
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */

            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));

            /* when no residual coefficient is coded, predicted coef == recon coef */
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));

            *totalUncodedCost += costUncoded[blkPos + x];
            *totalRdCost += costUncoded[blkPos + x];
        }
        blkPos += trSize;
    }
}
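// A sketch of the psy-rd adjustment above: the uncoded cost of a coefficient
// is its reconstruction error energy (signCoef^2, scaled to SCALE_BITS
// precision), reduced by a psychovisual credit proportional to the predicted
// coefficient,
//     cost -= (psyScale * predictedCoef) >> (2 * transformShift + 1)
// so blocks whose prediction already carries detail are cheaper to leave
// uncoded. The two variants below split the same computation into two passes:
// _1 accumulates the energy term only, _2 applies the psy adjustment.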
template<int log2TrSize>
static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t  *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
{
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
    const int scaleBits = SCALE_BITS - 2 * transformShift;
    const uint32_t trSize = 1 << log2TrSize;

    for (int y = 0; y < MLS_CG_SIZE; y++)
    {
        for (int x = 0; x < MLS_CG_SIZE; x++)
        {
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
            costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
            *totalUncodedCost += costUncoded[blkPos + x];
            *totalRdCost += costUncoded[blkPos + x];
        }
        blkPos += trSize;
    }
}
template<int log2TrSize>
static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
{
    const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */

    const uint32_t trSize = 1 << log2TrSize;
    int max = X265_MAX(0, (2 * transformShift + 1));

    for (int y = 0; y < MLS_CG_SIZE; y++)
    {
        for (int x = 0; x < MLS_CG_SIZE; x++)
        {
            int64_t signCoef = m_resiDctCoeff[blkPos + x];            /* pre-quantization DCT coeff */
            int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT */
            costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
            *totalUncodedCost += costUncoded[blkPos + x];
            *totalRdCost += costUncoded[blkPos + x];
        }
        blkPos += trSize;
    }
}

namespace X265_NS {
// x265 private namespace
void setupDCTPrimitives_c(EncoderPrimitives& p)
{
    p.dequant_scaling = dequant_scaling_c;
    p.dequant_normal = dequant_normal_c;
    p.quant = quant_c;
    p.nquant = nquant_c;
    p.cu[BLOCK_4x4].nonPsyRdoQuant   = nonPsyRdoQuant_c<2>;
    p.cu[BLOCK_8x8].nonPsyRdoQuant   = nonPsyRdoQuant_c<3>;
    p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
    p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
    p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
    p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
    p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
    p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
    p.dst4x4 = dst4_c;
    p.cu[BLOCK_4x4].dct   = dct4_c;
    p.cu[BLOCK_8x8].dct   = dct8_c;
    p.cu[BLOCK_16x16].dct = dct16_c;
    p.cu[BLOCK_32x32].dct = dct32_c;
    p.idst4x4 = idst4_c;
    p.cu[BLOCK_4x4].idct   = idct4_c;
    p.cu[BLOCK_8x8].idct   = idct8_c;
    p.cu[BLOCK_16x16].idct = idct16_c;
    p.cu[BLOCK_32x32].idct = idct32_c;
    p.denoiseDct = denoiseDct_c;
    p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
    p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
    p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
    p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;

    p.cu[BLOCK_4x4].copy_cnt   = copy_count<4>;
    p.cu[BLOCK_8x8].copy_cnt   = copy_count<8>;
    p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
    p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
    p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
    p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
    p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
    p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
    p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
    p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
    p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
    p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
    p.scanPosLast = scanPosLast_c;
    p.findPosFirstLast = findPosFirstLast_c;
    p.costCoeffNxN = costCoeffNxN_c;
    p.costCoeffRemain = costCoeffRemain_c;
    p.costC1C2Flag = costC1C2Flag_c;
}
}