1 /*****************************************************************************
2 * Copyright (C) 2013-2020 MulticoreWare, Inc
3 *
4 * Authors: Mandar Gurav <mandar@multicorewareinc.com>
5 * Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
6 * Mahesh Pittala <mahesh@multicorewareinc.com>
7 * Rajesh Paulraj <rajesh@multicorewareinc.com>
8 * Min Chen <min.chen@multicorewareinc.com>
9 * Praveen Kumar Tiwari <praveen@multicorewareinc.com>
10 * Nabajit Deka <nabajit@multicorewareinc.com>
11 *
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
25 *
26 * This program is also available under a commercial proprietary license.
27 * For more information, contact us at license @ x265.com.
28 *****************************************************************************/
29
30 #include "common.h"
31 #include "primitives.h"
32 #include "contexts.h" // costCoeffNxN_c
33 #include "threading.h" // CLZ
34
35 using namespace X265_NS;
36
37 #if _MSC_VER
38 #pragma warning(disable: 4127) // conditional expression is constant, typical for templated functions
39 #endif
40
41 // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
42 // give identical results
static void fastForwardDst(const int16_t* block, int16_t* coeff, int shift) // input block, output coeff
{
    // One pass of the 4-point fast DST over a 4x4 block.  Each group of four
    // consecutive inputs produces four outputs, which are stored transposed
    // (at coeff[row], coeff[4 + row], coeff[8 + row] and coeff[12 + row]).
    // Every result is rounded by adding half of (1 << shift) before the shift.
    const int round = 1 << (shift - 1);

    for (int row = 0; row < 4; row++)
    {
        const int16_t* in = block + 4 * row;

        // Partial sums shared between the outputs
        const int sum03  = in[0] + in[3];
        const int sum13  = in[1] + in[3];
        const int diff01 = in[0] - in[1];
        const int mid    = 74 * in[2];

        coeff[row]      = (int16_t)((29 * sum03 + 55 * sum13 + mid + round) >> shift);
        coeff[4 + row]  = (int16_t)((74 * (in[0] + in[1] - in[3]) + round) >> shift);
        coeff[8 + row]  = (int16_t)((29 * diff01 + 55 * sum03 - mid + round) >> shift);
        coeff[12 + row] = (int16_t)((55 * diff01 - 29 * sum13 + mid + round) >> shift);
    }
}
62
inversedst(const int16_t * tmp,int16_t * block,int shift)63 static void inversedst(const int16_t* tmp, int16_t* block, int shift) // input tmp, output block
64 {
65 int i, c[4];
66 int rnd_factor = 1 << (shift - 1);
67
68 for (i = 0; i < 4; i++)
69 {
70 // Intermediate Variables
71 c[0] = tmp[i] + tmp[8 + i];
72 c[1] = tmp[8 + i] + tmp[12 + i];
73 c[2] = tmp[i] - tmp[12 + i];
74 c[3] = 74 * tmp[4 + i];
75
76 block[4 * i + 0] = (int16_t)x265_clip3(-32768, 32767, (29 * c[0] + 55 * c[1] + c[3] + rnd_factor) >> shift);
77 block[4 * i + 1] = (int16_t)x265_clip3(-32768, 32767, (55 * c[2] - 29 * c[1] + c[3] + rnd_factor) >> shift);
78 block[4 * i + 2] = (int16_t)x265_clip3(-32768, 32767, (74 * (tmp[i] - tmp[8 + i] + tmp[12 + i]) + rnd_factor) >> shift);
79 block[4 * i + 3] = (int16_t)x265_clip3(-32768, 32767, (55 * c[0] + 29 * c[2] - c[3] + rnd_factor) >> shift);
80 }
81 }
82
partialButterfly16(const int16_t * src,int16_t * dst,int shift,int line)83 static void partialButterfly16(const int16_t* src, int16_t* dst, int shift, int line)
84 {
85 int j, k;
86 int E[8], O[8];
87 int EE[4], EO[4];
88 int EEE[2], EEO[2];
89 int add = 1 << (shift - 1);
90
91 for (j = 0; j < line; j++)
92 {
93 /* E and O */
94 for (k = 0; k < 8; k++)
95 {
96 E[k] = src[k] + src[15 - k];
97 O[k] = src[k] - src[15 - k];
98 }
99
100 /* EE and EO */
101 for (k = 0; k < 4; k++)
102 {
103 EE[k] = E[k] + E[7 - k];
104 EO[k] = E[k] - E[7 - k];
105 }
106
107 /* EEE and EEO */
108 EEE[0] = EE[0] + EE[3];
109 EEO[0] = EE[0] - EE[3];
110 EEE[1] = EE[1] + EE[2];
111 EEO[1] = EE[1] - EE[2];
112
113 dst[0] = (int16_t)((g_t16[0][0] * EEE[0] + g_t16[0][1] * EEE[1] + add) >> shift);
114 dst[8 * line] = (int16_t)((g_t16[8][0] * EEE[0] + g_t16[8][1] * EEE[1] + add) >> shift);
115 dst[4 * line] = (int16_t)((g_t16[4][0] * EEO[0] + g_t16[4][1] * EEO[1] + add) >> shift);
116 dst[12 * line] = (int16_t)((g_t16[12][0] * EEO[0] + g_t16[12][1] * EEO[1] + add) >> shift);
117
118 for (k = 2; k < 16; k += 4)
119 {
120 dst[k * line] = (int16_t)((g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1] + g_t16[k][2] * EO[2] +
121 g_t16[k][3] * EO[3] + add) >> shift);
122 }
123
124 for (k = 1; k < 16; k += 2)
125 {
126 dst[k * line] = (int16_t)((g_t16[k][0] * O[0] + g_t16[k][1] * O[1] + g_t16[k][2] * O[2] + g_t16[k][3] * O[3] +
127 g_t16[k][4] * O[4] + g_t16[k][5] * O[5] + g_t16[k][6] * O[6] + g_t16[k][7] * O[7] +
128 add) >> shift);
129 }
130
131 src += 16;
132 dst++;
133 }
134 }
135
partialButterfly32(const int16_t * src,int16_t * dst,int shift,int line)136 static void partialButterfly32(const int16_t* src, int16_t* dst, int shift, int line)
137 {
138 int j, k;
139 int E[16], O[16];
140 int EE[8], EO[8];
141 int EEE[4], EEO[4];
142 int EEEE[2], EEEO[2];
143 int add = 1 << (shift - 1);
144
145 for (j = 0; j < line; j++)
146 {
147 /* E and O*/
148 for (k = 0; k < 16; k++)
149 {
150 E[k] = src[k] + src[31 - k];
151 O[k] = src[k] - src[31 - k];
152 }
153
154 /* EE and EO */
155 for (k = 0; k < 8; k++)
156 {
157 EE[k] = E[k] + E[15 - k];
158 EO[k] = E[k] - E[15 - k];
159 }
160
161 /* EEE and EEO */
162 for (k = 0; k < 4; k++)
163 {
164 EEE[k] = EE[k] + EE[7 - k];
165 EEO[k] = EE[k] - EE[7 - k];
166 }
167
168 /* EEEE and EEEO */
169 EEEE[0] = EEE[0] + EEE[3];
170 EEEO[0] = EEE[0] - EEE[3];
171 EEEE[1] = EEE[1] + EEE[2];
172 EEEO[1] = EEE[1] - EEE[2];
173
174 dst[0] = (int16_t)((g_t32[0][0] * EEEE[0] + g_t32[0][1] * EEEE[1] + add) >> shift);
175 dst[16 * line] = (int16_t)((g_t32[16][0] * EEEE[0] + g_t32[16][1] * EEEE[1] + add) >> shift);
176 dst[8 * line] = (int16_t)((g_t32[8][0] * EEEO[0] + g_t32[8][1] * EEEO[1] + add) >> shift);
177 dst[24 * line] = (int16_t)((g_t32[24][0] * EEEO[0] + g_t32[24][1] * EEEO[1] + add) >> shift);
178 for (k = 4; k < 32; k += 8)
179 {
180 dst[k * line] = (int16_t)((g_t32[k][0] * EEO[0] + g_t32[k][1] * EEO[1] + g_t32[k][2] * EEO[2] +
181 g_t32[k][3] * EEO[3] + add) >> shift);
182 }
183
184 for (k = 2; k < 32; k += 4)
185 {
186 dst[k * line] = (int16_t)((g_t32[k][0] * EO[0] + g_t32[k][1] * EO[1] + g_t32[k][2] * EO[2] +
187 g_t32[k][3] * EO[3] + g_t32[k][4] * EO[4] + g_t32[k][5] * EO[5] +
188 g_t32[k][6] * EO[6] + g_t32[k][7] * EO[7] + add) >> shift);
189 }
190
191 for (k = 1; k < 32; k += 2)
192 {
193 dst[k * line] = (int16_t)((g_t32[k][0] * O[0] + g_t32[k][1] * O[1] + g_t32[k][2] * O[2] + g_t32[k][3] * O[3] +
194 g_t32[k][4] * O[4] + g_t32[k][5] * O[5] + g_t32[k][6] * O[6] + g_t32[k][7] * O[7] +
195 g_t32[k][8] * O[8] + g_t32[k][9] * O[9] + g_t32[k][10] * O[10] + g_t32[k][11] *
196 O[11] + g_t32[k][12] * O[12] + g_t32[k][13] * O[13] + g_t32[k][14] * O[14] +
197 g_t32[k][15] * O[15] + add) >> shift);
198 }
199
200 src += 32;
201 dst++;
202 }
203 }
204
partialButterfly8(const int16_t * src,int16_t * dst,int shift,int line)205 static void partialButterfly8(const int16_t* src, int16_t* dst, int shift, int line)
206 {
207 int j, k;
208 int E[4], O[4];
209 int EE[2], EO[2];
210 int add = 1 << (shift - 1);
211
212 for (j = 0; j < line; j++)
213 {
214 /* E and O*/
215 for (k = 0; k < 4; k++)
216 {
217 E[k] = src[k] + src[7 - k];
218 O[k] = src[k] - src[7 - k];
219 }
220
221 /* EE and EO */
222 EE[0] = E[0] + E[3];
223 EO[0] = E[0] - E[3];
224 EE[1] = E[1] + E[2];
225 EO[1] = E[1] - E[2];
226
227 dst[0] = (int16_t)((g_t8[0][0] * EE[0] + g_t8[0][1] * EE[1] + add) >> shift);
228 dst[4 * line] = (int16_t)((g_t8[4][0] * EE[0] + g_t8[4][1] * EE[1] + add) >> shift);
229 dst[2 * line] = (int16_t)((g_t8[2][0] * EO[0] + g_t8[2][1] * EO[1] + add) >> shift);
230 dst[6 * line] = (int16_t)((g_t8[6][0] * EO[0] + g_t8[6][1] * EO[1] + add) >> shift);
231
232 dst[line] = (int16_t)((g_t8[1][0] * O[0] + g_t8[1][1] * O[1] + g_t8[1][2] * O[2] + g_t8[1][3] * O[3] + add) >> shift);
233 dst[3 * line] = (int16_t)((g_t8[3][0] * O[0] + g_t8[3][1] * O[1] + g_t8[3][2] * O[2] + g_t8[3][3] * O[3] + add) >> shift);
234 dst[5 * line] = (int16_t)((g_t8[5][0] * O[0] + g_t8[5][1] * O[1] + g_t8[5][2] * O[2] + g_t8[5][3] * O[3] + add) >> shift);
235 dst[7 * line] = (int16_t)((g_t8[7][0] * O[0] + g_t8[7][1] * O[1] + g_t8[7][2] * O[2] + g_t8[7][3] * O[3] + add) >> shift);
236
237 src += 8;
238 dst++;
239 }
240 }
241
partialButterflyInverse4(const int16_t * src,int16_t * dst,int shift,int line)242 static void partialButterflyInverse4(const int16_t* src, int16_t* dst, int shift, int line)
243 {
244 int j;
245 int E[2], O[2];
246 int add = 1 << (shift - 1);
247
248 for (j = 0; j < line; j++)
249 {
250 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
251 O[0] = g_t4[1][0] * src[line] + g_t4[3][0] * src[3 * line];
252 O[1] = g_t4[1][1] * src[line] + g_t4[3][1] * src[3 * line];
253 E[0] = g_t4[0][0] * src[0] + g_t4[2][0] * src[2 * line];
254 E[1] = g_t4[0][1] * src[0] + g_t4[2][1] * src[2 * line];
255
256 /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
257 dst[0] = (int16_t)(x265_clip3(-32768, 32767, (E[0] + O[0] + add) >> shift));
258 dst[1] = (int16_t)(x265_clip3(-32768, 32767, (E[1] + O[1] + add) >> shift));
259 dst[2] = (int16_t)(x265_clip3(-32768, 32767, (E[1] - O[1] + add) >> shift));
260 dst[3] = (int16_t)(x265_clip3(-32768, 32767, (E[0] - O[0] + add) >> shift));
261
262 src++;
263 dst += 4;
264 }
265 }
266
partialButterflyInverse8(const int16_t * src,int16_t * dst,int shift,int line)267 static void partialButterflyInverse8(const int16_t* src, int16_t* dst, int shift, int line)
268 {
269 int j, k;
270 int E[4], O[4];
271 int EE[2], EO[2];
272 int add = 1 << (shift - 1);
273
274 for (j = 0; j < line; j++)
275 {
276 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
277 for (k = 0; k < 4; k++)
278 {
279 O[k] = g_t8[1][k] * src[line] + g_t8[3][k] * src[3 * line] + g_t8[5][k] * src[5 * line] + g_t8[7][k] * src[7 * line];
280 }
281
282 EO[0] = g_t8[2][0] * src[2 * line] + g_t8[6][0] * src[6 * line];
283 EO[1] = g_t8[2][1] * src[2 * line] + g_t8[6][1] * src[6 * line];
284 EE[0] = g_t8[0][0] * src[0] + g_t8[4][0] * src[4 * line];
285 EE[1] = g_t8[0][1] * src[0] + g_t8[4][1] * src[4 * line];
286
287 /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
288 E[0] = EE[0] + EO[0];
289 E[3] = EE[0] - EO[0];
290 E[1] = EE[1] + EO[1];
291 E[2] = EE[1] - EO[1];
292 for (k = 0; k < 4; k++)
293 {
294 dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
295 dst[k + 4] = (int16_t)x265_clip3(-32768, 32767, (E[3 - k] - O[3 - k] + add) >> shift);
296 }
297
298 src++;
299 dst += 8;
300 }
301 }
302
partialButterflyInverse16(const int16_t * src,int16_t * dst,int shift,int line)303 static void partialButterflyInverse16(const int16_t* src, int16_t* dst, int shift, int line)
304 {
305 int j, k;
306 int E[8], O[8];
307 int EE[4], EO[4];
308 int EEE[2], EEO[2];
309 int add = 1 << (shift - 1);
310
311 for (j = 0; j < line; j++)
312 {
313 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
314 for (k = 0; k < 8; k++)
315 {
316 O[k] = g_t16[1][k] * src[line] + g_t16[3][k] * src[3 * line] + g_t16[5][k] * src[5 * line] + g_t16[7][k] * src[7 * line] +
317 g_t16[9][k] * src[9 * line] + g_t16[11][k] * src[11 * line] + g_t16[13][k] * src[13 * line] + g_t16[15][k] * src[15 * line];
318 }
319
320 for (k = 0; k < 4; k++)
321 {
322 EO[k] = g_t16[2][k] * src[2 * line] + g_t16[6][k] * src[6 * line] + g_t16[10][k] * src[10 * line] + g_t16[14][k] * src[14 * line];
323 }
324
325 EEO[0] = g_t16[4][0] * src[4 * line] + g_t16[12][0] * src[12 * line];
326 EEE[0] = g_t16[0][0] * src[0] + g_t16[8][0] * src[8 * line];
327 EEO[1] = g_t16[4][1] * src[4 * line] + g_t16[12][1] * src[12 * line];
328 EEE[1] = g_t16[0][1] * src[0] + g_t16[8][1] * src[8 * line];
329
330 /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
331 for (k = 0; k < 2; k++)
332 {
333 EE[k] = EEE[k] + EEO[k];
334 EE[k + 2] = EEE[1 - k] - EEO[1 - k];
335 }
336
337 for (k = 0; k < 4; k++)
338 {
339 E[k] = EE[k] + EO[k];
340 E[k + 4] = EE[3 - k] - EO[3 - k];
341 }
342
343 for (k = 0; k < 8; k++)
344 {
345 dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
346 dst[k + 8] = (int16_t)x265_clip3(-32768, 32767, (E[7 - k] - O[7 - k] + add) >> shift);
347 }
348
349 src++;
350 dst += 16;
351 }
352 }
353
partialButterflyInverse32(const int16_t * src,int16_t * dst,int shift,int line)354 static void partialButterflyInverse32(const int16_t* src, int16_t* dst, int shift, int line)
355 {
356 int j, k;
357 int E[16], O[16];
358 int EE[8], EO[8];
359 int EEE[4], EEO[4];
360 int EEEE[2], EEEO[2];
361 int add = 1 << (shift - 1);
362
363 for (j = 0; j < line; j++)
364 {
365 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
366 for (k = 0; k < 16; k++)
367 {
368 O[k] = g_t32[1][k] * src[line] + g_t32[3][k] * src[3 * line] + g_t32[5][k] * src[5 * line] + g_t32[7][k] * src[7 * line] +
369 g_t32[9][k] * src[9 * line] + g_t32[11][k] * src[11 * line] + g_t32[13][k] * src[13 * line] + g_t32[15][k] * src[15 * line] +
370 g_t32[17][k] * src[17 * line] + g_t32[19][k] * src[19 * line] + g_t32[21][k] * src[21 * line] + g_t32[23][k] * src[23 * line] +
371 g_t32[25][k] * src[25 * line] + g_t32[27][k] * src[27 * line] + g_t32[29][k] * src[29 * line] + g_t32[31][k] * src[31 * line];
372 }
373
374 for (k = 0; k < 8; k++)
375 {
376 EO[k] = g_t32[2][k] * src[2 * line] + g_t32[6][k] * src[6 * line] + g_t32[10][k] * src[10 * line] + g_t32[14][k] * src[14 * line] +
377 g_t32[18][k] * src[18 * line] + g_t32[22][k] * src[22 * line] + g_t32[26][k] * src[26 * line] + g_t32[30][k] * src[30 * line];
378 }
379
380 for (k = 0; k < 4; k++)
381 {
382 EEO[k] = g_t32[4][k] * src[4 * line] + g_t32[12][k] * src[12 * line] + g_t32[20][k] * src[20 * line] + g_t32[28][k] * src[28 * line];
383 }
384
385 EEEO[0] = g_t32[8][0] * src[8 * line] + g_t32[24][0] * src[24 * line];
386 EEEO[1] = g_t32[8][1] * src[8 * line] + g_t32[24][1] * src[24 * line];
387 EEEE[0] = g_t32[0][0] * src[0] + g_t32[16][0] * src[16 * line];
388 EEEE[1] = g_t32[0][1] * src[0] + g_t32[16][1] * src[16 * line];
389
390 /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
391 EEE[0] = EEEE[0] + EEEO[0];
392 EEE[3] = EEEE[0] - EEEO[0];
393 EEE[1] = EEEE[1] + EEEO[1];
394 EEE[2] = EEEE[1] - EEEO[1];
395 for (k = 0; k < 4; k++)
396 {
397 EE[k] = EEE[k] + EEO[k];
398 EE[k + 4] = EEE[3 - k] - EEO[3 - k];
399 }
400
401 for (k = 0; k < 8; k++)
402 {
403 E[k] = EE[k] + EO[k];
404 E[k + 8] = EE[7 - k] - EO[7 - k];
405 }
406
407 for (k = 0; k < 16; k++)
408 {
409 dst[k] = (int16_t)x265_clip3(-32768, 32767, (E[k] + O[k] + add) >> shift);
410 dst[k + 16] = (int16_t)x265_clip3(-32768, 32767, (E[15 - k] - O[15 - k] + add) >> shift);
411 }
412
413 src++;
414 dst += 32;
415 }
416 }
417
partialButterfly4(const int16_t * src,int16_t * dst,int shift,int line)418 static void partialButterfly4(const int16_t* src, int16_t* dst, int shift, int line)
419 {
420 int j;
421 int E[2], O[2];
422 int add = 1 << (shift - 1);
423
424 for (j = 0; j < line; j++)
425 {
426 /* E and O */
427 E[0] = src[0] + src[3];
428 O[0] = src[0] - src[3];
429 E[1] = src[1] + src[2];
430 O[1] = src[1] - src[2];
431
432 dst[0] = (int16_t)((g_t4[0][0] * E[0] + g_t4[0][1] * E[1] + add) >> shift);
433 dst[2 * line] = (int16_t)((g_t4[2][0] * E[0] + g_t4[2][1] * E[1] + add) >> shift);
434 dst[line] = (int16_t)((g_t4[1][0] * O[0] + g_t4[1][1] * O[1] + add) >> shift);
435 dst[3 * line] = (int16_t)((g_t4[3][0] * O[0] + g_t4[3][1] * O[1] + add) >> shift);
436
437 src += 4;
438 dst++;
439 }
440 }
441
dst4_c(const int16_t * src,int16_t * dst,intptr_t srcStride)442 static void dst4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
443 {
444 const int shift_1st = 1 + X265_DEPTH - 8;
445 const int shift_2nd = 8;
446
447 ALIGN_VAR_32(int16_t, coef[4 * 4]);
448 ALIGN_VAR_32(int16_t, block[4 * 4]);
449
450 for (int i = 0; i < 4; i++)
451 {
452 memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
453 }
454
455 fastForwardDst(block, coef, shift_1st);
456 fastForwardDst(coef, dst, shift_2nd);
457 }
458
dct4_c(const int16_t * src,int16_t * dst,intptr_t srcStride)459 static void dct4_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
460 {
461 const int shift_1st = 1 + X265_DEPTH - 8;
462 const int shift_2nd = 8;
463
464 ALIGN_VAR_32(int16_t, coef[4 * 4]);
465 ALIGN_VAR_32(int16_t, block[4 * 4]);
466
467 for (int i = 0; i < 4; i++)
468 {
469 memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
470 }
471
472 partialButterfly4(block, coef, shift_1st, 4);
473 partialButterfly4(coef, dst, shift_2nd, 4);
474 }
475
dct8_c(const int16_t * src,int16_t * dst,intptr_t srcStride)476 static void dct8_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
477 {
478 const int shift_1st = 2 + X265_DEPTH - 8;
479 const int shift_2nd = 9;
480
481 ALIGN_VAR_32(int16_t, coef[8 * 8]);
482 ALIGN_VAR_32(int16_t, block[8 * 8]);
483
484 for (int i = 0; i < 8; i++)
485 {
486 memcpy(&block[i * 8], &src[i * srcStride], 8 * sizeof(int16_t));
487 }
488
489 partialButterfly8(block, coef, shift_1st, 8);
490 partialButterfly8(coef, dst, shift_2nd, 8);
491 }
492
dct16_c(const int16_t * src,int16_t * dst,intptr_t srcStride)493 static void dct16_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
494 {
495 const int shift_1st = 3 + X265_DEPTH - 8;
496 const int shift_2nd = 10;
497
498 ALIGN_VAR_32(int16_t, coef[16 * 16]);
499 ALIGN_VAR_32(int16_t, block[16 * 16]);
500
501 for (int i = 0; i < 16; i++)
502 {
503 memcpy(&block[i * 16], &src[i * srcStride], 16 * sizeof(int16_t));
504 }
505
506 partialButterfly16(block, coef, shift_1st, 16);
507 partialButterfly16(coef, dst, shift_2nd, 16);
508 }
509
dct32_c(const int16_t * src,int16_t * dst,intptr_t srcStride)510 static void dct32_c(const int16_t* src, int16_t* dst, intptr_t srcStride)
511 {
512 const int shift_1st = 4 + X265_DEPTH - 8;
513 const int shift_2nd = 11;
514
515 ALIGN_VAR_32(int16_t, coef[32 * 32]);
516 ALIGN_VAR_32(int16_t, block[32 * 32]);
517
518 for (int i = 0; i < 32; i++)
519 {
520 memcpy(&block[i * 32], &src[i * srcStride], 32 * sizeof(int16_t));
521 }
522
523 partialButterfly32(block, coef, shift_1st, 32);
524 partialButterfly32(coef, dst, shift_2nd, 32);
525 }
526
idst4_c(const int16_t * src,int16_t * dst,intptr_t dstStride)527 static void idst4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
528 {
529 const int shift_1st = 7;
530 const int shift_2nd = 12 - (X265_DEPTH - 8);
531
532 ALIGN_VAR_32(int16_t, coef[4 * 4]);
533 ALIGN_VAR_32(int16_t, block[4 * 4]);
534
535 inversedst(src, coef, shift_1st); // Forward DST BY FAST ALGORITHM, block input, coef output
536 inversedst(coef, block, shift_2nd); // Forward DST BY FAST ALGORITHM, coef input, coeff output
537
538 for (int i = 0; i < 4; i++)
539 {
540 memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
541 }
542 }
543
idct4_c(const int16_t * src,int16_t * dst,intptr_t dstStride)544 static void idct4_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
545 {
546 const int shift_1st = 7;
547 const int shift_2nd = 12 - (X265_DEPTH - 8);
548
549 ALIGN_VAR_32(int16_t, coef[4 * 4]);
550 ALIGN_VAR_32(int16_t, block[4 * 4]);
551
552 partialButterflyInverse4(src, coef, shift_1st, 4); // Forward DST BY FAST ALGORITHM, block input, coef output
553 partialButterflyInverse4(coef, block, shift_2nd, 4); // Forward DST BY FAST ALGORITHM, coef input, coeff output
554
555 for (int i = 0; i < 4; i++)
556 {
557 memcpy(&dst[i * dstStride], &block[i * 4], 4 * sizeof(int16_t));
558 }
559 }
560
idct8_c(const int16_t * src,int16_t * dst,intptr_t dstStride)561 static void idct8_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
562 {
563 const int shift_1st = 7;
564 const int shift_2nd = 12 - (X265_DEPTH - 8);
565
566 ALIGN_VAR_32(int16_t, coef[8 * 8]);
567 ALIGN_VAR_32(int16_t, block[8 * 8]);
568
569 partialButterflyInverse8(src, coef, shift_1st, 8);
570 partialButterflyInverse8(coef, block, shift_2nd, 8);
571
572 for (int i = 0; i < 8; i++)
573 {
574 memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
575 }
576 }
577
idct16_c(const int16_t * src,int16_t * dst,intptr_t dstStride)578 static void idct16_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
579 {
580 const int shift_1st = 7;
581 const int shift_2nd = 12 - (X265_DEPTH - 8);
582
583 ALIGN_VAR_32(int16_t, coef[16 * 16]);
584 ALIGN_VAR_32(int16_t, block[16 * 16]);
585
586 partialButterflyInverse16(src, coef, shift_1st, 16);
587 partialButterflyInverse16(coef, block, shift_2nd, 16);
588
589 for (int i = 0; i < 16; i++)
590 {
591 memcpy(&dst[i * dstStride], &block[i * 16], 16 * sizeof(int16_t));
592 }
593 }
594
idct32_c(const int16_t * src,int16_t * dst,intptr_t dstStride)595 static void idct32_c(const int16_t* src, int16_t* dst, intptr_t dstStride)
596 {
597 const int shift_1st = 7;
598 const int shift_2nd = 12 - (X265_DEPTH - 8);
599
600 ALIGN_VAR_32(int16_t, coef[32 * 32]);
601 ALIGN_VAR_32(int16_t, block[32 * 32]);
602
603 partialButterflyInverse32(src, coef, shift_1st, 32);
604 partialButterflyInverse32(coef, block, shift_2nd, 32);
605
606 for (int i = 0; i < 32; i++)
607 {
608 memcpy(&dst[i * dstStride], &block[i * 32], 32 * sizeof(int16_t));
609 }
610 }
611
dequant_normal_c(const int16_t * quantCoef,int16_t * coef,int num,int scale,int shift)612 static void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
613 {
614 #if HIGH_BIT_DEPTH
615 X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > (X265_DEPTH - 8)), "dequant invalid scale %d\n", scale);
616 #else
617 // NOTE: maximum of scale is (72 * 256)
618 X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
619 #endif
620 X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
621 X265_CHECK((num % 8) == 0, "dequant num %d not multiple of 8\n", num);
622 X265_CHECK(shift <= 10, "shift too large %d\n", shift);
623 X265_CHECK(((intptr_t)coef & 31) == 0, "dequant coef buffer not aligned\n");
624
625 int add, coeffQ;
626
627 add = 1 << (shift - 1);
628
629 for (int n = 0; n < num; n++)
630 {
631 coeffQ = (quantCoef[n] * scale + add) >> shift;
632 coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
633 }
634 }
635
dequant_scaling_c(const int16_t * quantCoef,const int32_t * deQuantCoef,int16_t * coef,int num,int per,int shift)636 static void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
637 {
638 X265_CHECK(num <= 32 * 32, "dequant num %d too large\n", num);
639
640 int add, coeffQ;
641
642 shift += 4;
643
644 if (shift > per)
645 {
646 add = 1 << (shift - per - 1);
647
648 for (int n = 0; n < num; n++)
649 {
650 coeffQ = ((quantCoef[n] * deQuantCoef[n]) + add) >> (shift - per);
651 coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ);
652 }
653 }
654 else
655 {
656 for (int n = 0; n < num; n++)
657 {
658 coeffQ = x265_clip3(-32768, 32767, quantCoef[n] * deQuantCoef[n]);
659 coef[n] = (int16_t)x265_clip3(-32768, 32767, coeffQ << (per - shift));
660 }
661 }
662 }
663
quant_c(const int16_t * coef,const int32_t * quantCoeff,int32_t * deltaU,int16_t * qCoef,int qBits,int add,int numCoeff)664 static uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
665 {
666 X265_CHECK(qBits >= 8, "qBits less than 8\n");
667 X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
668 int qBits8 = qBits - 8;
669 uint32_t numSig = 0;
670
671 for (int blockpos = 0; blockpos < numCoeff; blockpos++)
672 {
673 int level = coef[blockpos];
674 int sign = (level < 0 ? -1 : 1);
675
676 int tmplevel = abs(level) * quantCoeff[blockpos];
677 level = ((tmplevel + add) >> qBits);
678 deltaU[blockpos] = ((tmplevel - (level << qBits)) >> qBits8);
679 if (level)
680 ++numSig;
681 level *= sign;
682 qCoef[blockpos] = (int16_t)x265_clip3(-32768, 32767, level);
683 }
684
685 return numSig;
686 }
687
nquant_c(const int16_t * coef,const int32_t * quantCoeff,int16_t * qCoef,int qBits,int add,int numCoeff)688 static uint32_t nquant_c(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)
689 {
690 X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
691 X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
692 X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
693
694 uint32_t numSig = 0;
695
696 for (int blockpos = 0; blockpos < numCoeff; blockpos++)
697 {
698 int level = coef[blockpos];
699 int sign = (level < 0 ? -1 : 1);
700
701 int tmplevel = abs(level) * quantCoeff[blockpos];
702 level = ((tmplevel + add) >> qBits);
703 if (level)
704 ++numSig;
705 level *= sign;
706
707 // TODO: when we limit range to [-32767, 32767], we can get more performance with output change
708 // But nquant is a little percent in rdoQuant, so I keep old dynamic range for compatible
709 qCoef[blockpos] = (int16_t)abs(x265_clip3(-32768, 32767, level));
710 }
711
712 return numSig;
713 }
714 template<int trSize>
count_nonzero_c(const int16_t * quantCoeff)715 int count_nonzero_c(const int16_t* quantCoeff)
716 {
717 X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quant buffer not aligned\n");
718 int count = 0;
719 int numCoeff = trSize * trSize;
720 for (int i = 0; i < numCoeff; i++)
721 {
722 count += quantCoeff[i] != 0;
723 }
724
725 return count;
726 }
727
template<int trSize>
uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
{
    // Copies a trSize x trSize block from the strided `residual` buffer into
    // the contiguous `coeff` buffer and returns the number of non-zero values.
    uint32_t nonZero = 0;

    for (int row = 0; row < trSize; row++)
    {
        const int16_t* in = residual + row * resiStride;
        int16_t* out = coeff + row * trSize;

        for (int col = 0; col < trSize; col++)
        {
            const int16_t value = in[col];
            out[col] = value;
            if (value != 0)
                nonZero++;
        }
    }

    return nonZero;
}
743
static void denoiseDct_c(int16_t* dctCoef, uint32_t* resSum, const uint16_t* offset, int numCoeff)
{
    // For every coefficient: add its magnitude to resSum[i], then shrink the
    // magnitude by offset[i].  A coefficient whose magnitude drops below zero
    // becomes 0; otherwise the original sign is restored.
    for (int i = 0; i < numCoeff; i++)
    {
        const int value = dctCoef[i];
        const int negative = (value < 0);
        const int magnitude = negative ? -value : value;

        resSum[i] += magnitude;

        const int reduced = magnitude - offset[i];
        if (reduced < 0)
            dctCoef[i] = 0;
        else
            dctCoef[i] = (int16_t)(negative ? -reduced : reduced);
    }
}
756
scanPosLast_c(const uint16_t * scan,const coeff_t * coeff,uint16_t * coeffSign,uint16_t * coeffFlag,uint8_t * coeffNum,int numSig,const uint16_t *,const int)757 static int scanPosLast_c(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* /*scanCG4x4*/, const int /*trSize*/)
758 {
759 memset(coeffNum, 0, MLS_GRP_NUM * sizeof(*coeffNum));
760 memset(coeffFlag, 0, MLS_GRP_NUM * sizeof(*coeffFlag));
761 memset(coeffSign, 0, MLS_GRP_NUM * sizeof(*coeffSign));
762
763 int scanPosLast = 0;
764 do
765 {
766 const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
767
768 const uint32_t posLast = scan[scanPosLast++];
769
770 const int curCoeff = coeff[posLast];
771 const uint32_t isNZCoeff = (curCoeff != 0);
772 // get L1 sig map
773 // NOTE: the new algorithm is complicated, so I keep reference code here
774 //uint32_t posy = posLast >> log2TrSize;
775 //uint32_t posx = posLast - (posy << log2TrSize);
776 //uint32_t blkIdx0 = ((posy >> MLS_CG_LOG2_SIZE) << codingParameters.log2TrSizeCG) + (posx >> MLS_CG_LOG2_SIZE);
777 //const uint32_t blkIdx = ((posLast >> (2 * MLS_CG_LOG2_SIZE)) & ~maskPosXY) + ((posLast >> MLS_CG_LOG2_SIZE) & maskPosXY);
778 //sigCoeffGroupFlag64 |= ((uint64_t)isNZCoeff << blkIdx);
779 numSig -= isNZCoeff;
780
781 // TODO: optimize by instruction BTS
782 coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
783 coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
784 coeffNum[cgIdx] += (uint8_t)isNZCoeff;
785 }
786 while (numSig > 0);
787 return scanPosLast - 1;
788 }
789
790 // NOTE: no defined value on lastNZPosInCG & absSumSign when ALL ZEROS block as input
findPosFirstLast_c(const int16_t * dstCoeff,const intptr_t trSize,const uint16_t scanTbl[16])791 static uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
792 {
793 int n;
794
795 for (n = SCAN_SET_SIZE - 1; n >= 0; n--)
796 {
797 const uint32_t idx = scanTbl[n];
798 const uint32_t idxY = idx / MLS_CG_SIZE;
799 const uint32_t idxX = idx % MLS_CG_SIZE;
800 if (dstCoeff[idxY * trSize + idxX])
801 break;
802 }
803
804 X265_CHECK(n >= -1, "non-zero coeff scan failuare!\n");
805
806 uint32_t lastNZPosInCG = (uint32_t)n;
807
808 for (n = 0; n < SCAN_SET_SIZE; n++)
809 {
810 const uint32_t idx = scanTbl[n];
811 const uint32_t idxY = idx / MLS_CG_SIZE;
812 const uint32_t idxX = idx % MLS_CG_SIZE;
813 if (dstCoeff[idxY * trSize + idxX])
814 break;
815 }
816
817 uint32_t firstNZPosInCG = (uint32_t)n;
818
819 uint32_t absSumSign = 0;
820 for (n = firstNZPosInCG; n <= (int)lastNZPosInCG; n++)
821 {
822 const uint32_t idx = scanTbl[n];
823 const uint32_t idxY = idx / MLS_CG_SIZE;
824 const uint32_t idxX = idx % MLS_CG_SIZE;
825 absSumSign += dstCoeff[idxY * trSize + idxX];
826 }
827
828 // NOTE: when coeff block all ZERO, the lastNZPosInCG is undefined and firstNZPosInCG is 16
829 return ((absSumSign << 31) | (lastNZPosInCG << 8) | firstNZPosInCG);
830 }
831
832
costCoeffNxN_c(const uint16_t * scan,const coeff_t * coeff,intptr_t trSize,uint16_t * absCoeff,const uint8_t * tabSigCtx,uint32_t scanFlagMask,uint8_t * baseCtx,int offset,int scanPosSigOff,int subPosBase)833 static uint32_t costCoeffNxN_c(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
834 {
835 ALIGN_VAR_32(uint16_t, tmpCoeff[SCAN_SET_SIZE]);
836 uint32_t numNonZero = (scanPosSigOff < (SCAN_SET_SIZE - 1) ? 1 : 0);
837 uint32_t sum = 0;
838
839 // correct offset to match assembly
840 absCoeff -= numNonZero;
841
842 for (int i = 0; i < MLS_CG_SIZE; i++)
843 {
844 tmpCoeff[i * MLS_CG_SIZE + 0] = (uint16_t)abs(coeff[i * trSize + 0]);
845 tmpCoeff[i * MLS_CG_SIZE + 1] = (uint16_t)abs(coeff[i * trSize + 1]);
846 tmpCoeff[i * MLS_CG_SIZE + 2] = (uint16_t)abs(coeff[i * trSize + 2]);
847 tmpCoeff[i * MLS_CG_SIZE + 3] = (uint16_t)abs(coeff[i * trSize + 3]);
848 }
849
850 do
851 {
852 uint32_t blkPos, sig, ctxSig;
853 blkPos = scan[scanPosSigOff];
854 const uint32_t posZeroMask = (subPosBase + scanPosSigOff) ? ~0 : 0;
855 sig = scanFlagMask & 1;
856 scanFlagMask >>= 1;
857 X265_CHECK((uint32_t)(tmpCoeff[blkPos] != 0) == sig, "sign bit mistake\n");
858 if ((scanPosSigOff != 0) || (subPosBase == 0) || numNonZero)
859 {
860 const uint32_t cnt = tabSigCtx[blkPos] + offset;
861 ctxSig = cnt & posZeroMask;
862
863 //X265_CHECK(ctxSig == Quant::getSigCtxInc(patternSigCtx, log2TrSize, trSize, codingParameters.scan[subPosBase + scanPosSigOff], bIsLuma, codingParameters.firstSignificanceMapContext), "sigCtx mistake!\n");;
864 //encodeBin(sig, baseCtx[ctxSig]);
865 const uint32_t mstate = baseCtx[ctxSig];
866 const uint32_t mps = mstate & 1;
867 const uint32_t stateBits = PFX(entropyStateBits)[mstate ^ sig];
868 uint32_t nextState = (stateBits >> 24) + mps;
869 if ((mstate ^ sig) == 1)
870 nextState = sig;
871 X265_CHECK(sbacNext(mstate, sig) == nextState, "nextState check failure\n");
872 X265_CHECK(sbacGetEntropyBits(mstate, sig) == (stateBits & 0xFFFFFF), "entropyBits check failure\n");
873 baseCtx[ctxSig] = (uint8_t)nextState;
874 sum += stateBits;
875 }
876 assert(numNonZero <= 15);
877 assert(blkPos <= 15);
878 absCoeff[numNonZero] = tmpCoeff[blkPos];
879 numNonZero += sig;
880 scanPosSigOff--;
881 }
882 while(scanPosSigOff >= 0);
883
884 return (sum & 0xFFFFFF);
885 }
886
costCoeffRemain_c(uint16_t * absCoeff,int numNonZero,int idx)887 static uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
888 {
889 uint32_t goRiceParam = 0;
890
891 uint32_t sum = 0;
892 int baseLevel = 3;
893 do
894 {
895 if (idx >= C1FLAG_NUMBER)
896 baseLevel = 1;
897
898 // TODO: the IDX is not really idx, so this check inactive
899 //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
900 int codeNumber = absCoeff[idx] - baseLevel;
901
902 if (codeNumber >= 0)
903 {
904 //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
905 uint32_t length = 0;
906
907 codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
908 if (codeNumber >= 0)
909 {
910 {
911 unsigned long cidx;
912 CLZ(cidx, codeNumber + 1);
913 length = cidx;
914 }
915 X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
916
917 codeNumber = (length + length);
918 }
919 sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
920
921 if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
922 goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
923 X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
924 }
925 baseLevel = 2;
926 idx++;
927 }
928 while(idx < numNonZero);
929
930 return sum;
931 }
932
933
costC1C2Flag_c(uint16_t * absCoeff,intptr_t numC1Flag,uint8_t * baseCtxMod,intptr_t ctxOffset)934 static uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
935 {
936 uint32_t sum = 0;
937 uint32_t c1 = 1;
938 uint32_t firstC2Idx = 8;
939 uint32_t firstC2Flag = 2;
940 uint32_t c1Next = 0xFFFFFFFE;
941
942 int idx = 0;
943 do
944 {
945 uint32_t symbol1 = absCoeff[idx] > 1;
946 uint32_t symbol2 = absCoeff[idx] > 2;
947 //encodeBin(symbol1, baseCtxMod[c1]);
948 {
949 const uint32_t mstate = baseCtxMod[c1];
950 baseCtxMod[c1] = sbacNext(mstate, symbol1);
951 sum += sbacGetEntropyBits(mstate, symbol1);
952 }
953
954 if (symbol1)
955 c1Next = 0;
956
957 if (symbol1 + firstC2Flag == 3)
958 firstC2Flag = symbol2;
959
960 if (symbol1 + firstC2Idx == 9)
961 firstC2Idx = idx;
962
963 c1 = (c1Next & 3);
964 c1Next >>= 2;
965 X265_CHECK(c1 <= 3, "c1 check failure\n");
966 idx++;
967 }
968 while(idx < numC1Flag);
969
970 if (!c1)
971 {
972 X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
973
974 baseCtxMod += ctxOffset;
975
976 //encodeBin(firstC2Flag, baseCtxMod[0]);
977 {
978 const uint32_t mstate = baseCtxMod[0];
979 baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
980 sum += sbacGetEntropyBits(mstate, firstC2Flag);
981 }
982 }
983 return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
984 }
985 template<int log2TrSize>
nonPsyRdoQuant_c(int16_t * m_resiDctCoeff,int64_t * costUncoded,int64_t * totalUncodedCost,int64_t * totalRdCost,uint32_t blkPos)986 static void nonPsyRdoQuant_c(int16_t *m_resiDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, uint32_t blkPos)
987 {
988 const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
989 const int scaleBits = SCALE_BITS - 2 * transformShift;
990 const uint32_t trSize = 1 << log2TrSize;
991
992 for (int y = 0; y < MLS_CG_SIZE; y++)
993 {
994 for (int x = 0; x < MLS_CG_SIZE; x++)
995 {
996 int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
997 costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
998 *totalUncodedCost += costUncoded[blkPos + x];
999 *totalRdCost += costUncoded[blkPos + x];
1000 }
1001 blkPos += trSize;
1002 }
1003 }
1004 template<int log2TrSize>
psyRdoQuant_c(int16_t * m_resiDctCoeff,int16_t * m_fencDctCoeff,int64_t * costUncoded,int64_t * totalUncodedCost,int64_t * totalRdCost,int64_t * psyScale,uint32_t blkPos)1005 static void psyRdoQuant_c(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1006 {
1007 const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1008 const int scaleBits = SCALE_BITS - 2 * transformShift;
1009 const uint32_t trSize = 1 << log2TrSize;
1010 int max = X265_MAX(0, (2 * transformShift + 1));
1011
1012 for (int y = 0; y < MLS_CG_SIZE; y++)
1013 {
1014 for (int x = 0; x < MLS_CG_SIZE; x++)
1015 {
1016 int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
1017 int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1018
1019 costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1020
1021 /* when no residual coefficient is coded, predicted coef == recon coef */
1022 costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1023
1024 *totalUncodedCost += costUncoded[blkPos + x];
1025 *totalRdCost += costUncoded[blkPos + x];
1026 }
1027 blkPos += trSize;
1028 }
1029 }
1030 template<int log2TrSize>
psyRdoQuant_c_1(int16_t * m_resiDctCoeff,int64_t * costUncoded,int64_t * totalUncodedCost,int64_t * totalRdCost,uint32_t blkPos)1031 static void psyRdoQuant_c_1(int16_t *m_resiDctCoeff, /*int16_t *m_fencDctCoeff, */ int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, /* int64_t *psyScale,*/ uint32_t blkPos)
1032 {
1033 const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1034 const int scaleBits = SCALE_BITS - 2 * transformShift;
1035 const uint32_t trSize = 1 << log2TrSize;
1036
1037 for (int y = 0; y < MLS_CG_SIZE; y++)
1038 {
1039 for (int x = 0; x < MLS_CG_SIZE; x++)
1040 {
1041 int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
1042 costUncoded[blkPos + x] = static_cast<int64_t>((double)((signCoef * signCoef) << scaleBits));
1043 *totalUncodedCost += costUncoded[blkPos + x];
1044 *totalRdCost += costUncoded[blkPos + x];
1045 }
1046 blkPos += trSize;
1047 }
1048 }
1049 template<int log2TrSize>
psyRdoQuant_c_2(int16_t * m_resiDctCoeff,int16_t * m_fencDctCoeff,int64_t * costUncoded,int64_t * totalUncodedCost,int64_t * totalRdCost,int64_t * psyScale,uint32_t blkPos)1050 static void psyRdoQuant_c_2(int16_t *m_resiDctCoeff, int16_t *m_fencDctCoeff, int64_t *costUncoded, int64_t *totalUncodedCost, int64_t *totalRdCost, int64_t *psyScale, uint32_t blkPos)
1051 {
1052 const int transformShift = MAX_TR_DYNAMIC_RANGE - X265_DEPTH - log2TrSize; /* Represents scaling through forward transform */
1053
1054 const uint32_t trSize = 1 << log2TrSize;
1055 int max = X265_MAX(0, (2 * transformShift + 1));
1056
1057 for (int y = 0; y < MLS_CG_SIZE; y++)
1058 {
1059 for (int x = 0; x < MLS_CG_SIZE; x++)
1060 {
1061 int64_t signCoef = m_resiDctCoeff[blkPos + x]; /* pre-quantization DCT coeff */
1062 int64_t predictedCoef = m_fencDctCoeff[blkPos + x] - signCoef; /* predicted DCT = source DCT - residual DCT*/
1063 costUncoded[blkPos + x] -= static_cast<int64_t>((double)(((*psyScale) * predictedCoef) >> max));
1064 *totalUncodedCost += costUncoded[blkPos + x];
1065 *totalRdCost += costUncoded[blkPos + x];
1066 }
1067 blkPos += trSize;
1068 }
1069 }
1070
1071 namespace X265_NS {
1072 // x265 private namespace
setupDCTPrimitives_c(EncoderPrimitives & p)1073 void setupDCTPrimitives_c(EncoderPrimitives& p)
1074 {
1075 p.dequant_scaling = dequant_scaling_c;
1076 p.dequant_normal = dequant_normal_c;
1077 p.quant = quant_c;
1078 p.nquant = nquant_c;
1079 p.cu[BLOCK_4x4].nonPsyRdoQuant = nonPsyRdoQuant_c<2>;
1080 p.cu[BLOCK_8x8].nonPsyRdoQuant = nonPsyRdoQuant_c<3>;
1081 p.cu[BLOCK_16x16].nonPsyRdoQuant = nonPsyRdoQuant_c<4>;
1082 p.cu[BLOCK_32x32].nonPsyRdoQuant = nonPsyRdoQuant_c<5>;
1083 p.cu[BLOCK_4x4].psyRdoQuant = psyRdoQuant_c<2>;
1084 p.cu[BLOCK_8x8].psyRdoQuant = psyRdoQuant_c<3>;
1085 p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_c<4>;
1086 p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_c<5>;
1087 p.dst4x4 = dst4_c;
1088 p.cu[BLOCK_4x4].dct = dct4_c;
1089 p.cu[BLOCK_8x8].dct = dct8_c;
1090 p.cu[BLOCK_16x16].dct = dct16_c;
1091 p.cu[BLOCK_32x32].dct = dct32_c;
1092 p.idst4x4 = idst4_c;
1093 p.cu[BLOCK_4x4].idct = idct4_c;
1094 p.cu[BLOCK_8x8].idct = idct8_c;
1095 p.cu[BLOCK_16x16].idct = idct16_c;
1096 p.cu[BLOCK_32x32].idct = idct32_c;
1097 p.denoiseDct = denoiseDct_c;
1098 p.cu[BLOCK_4x4].count_nonzero = count_nonzero_c<4>;
1099 p.cu[BLOCK_8x8].count_nonzero = count_nonzero_c<8>;
1100 p.cu[BLOCK_16x16].count_nonzero = count_nonzero_c<16>;
1101 p.cu[BLOCK_32x32].count_nonzero = count_nonzero_c<32>;
1102
1103 p.cu[BLOCK_4x4].copy_cnt = copy_count<4>;
1104 p.cu[BLOCK_8x8].copy_cnt = copy_count<8>;
1105 p.cu[BLOCK_16x16].copy_cnt = copy_count<16>;
1106 p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
1107 p.cu[BLOCK_4x4].psyRdoQuant_1p = psyRdoQuant_c_1<2>;
1108 p.cu[BLOCK_4x4].psyRdoQuant_2p = psyRdoQuant_c_2<2>;
1109 p.cu[BLOCK_8x8].psyRdoQuant_1p = psyRdoQuant_c_1<3>;
1110 p.cu[BLOCK_8x8].psyRdoQuant_2p = psyRdoQuant_c_2<3>;
1111 p.cu[BLOCK_16x16].psyRdoQuant_1p = psyRdoQuant_c_1<4>;
1112 p.cu[BLOCK_16x16].psyRdoQuant_2p = psyRdoQuant_c_2<4>;
1113 p.cu[BLOCK_32x32].psyRdoQuant_1p = psyRdoQuant_c_1<5>;
1114 p.cu[BLOCK_32x32].psyRdoQuant_2p = psyRdoQuant_c_2<5>;
1115 p.scanPosLast = scanPosLast_c;
1116 p.findPosFirstLast = findPosFirstLast_c;
1117 p.costCoeffNxN = costCoeffNxN_c;
1118 p.costCoeffRemain = costCoeffRemain_c;
1119 p.costC1C2Flag = costC1C2Flag_c;
1120 }
1121 }
1122