1 /*****************************************************************************
2 * This file is part of Kvazaar HEVC encoder.
3 *
4 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without modification,
8 * are permitted provided that the following conditions are met:
9 *
10 * * Redistributions of source code must retain the above copyright notice, this
11 * list of conditions and the following disclaimer.
12 *
13 * * Redistributions in binary form must reproduce the above copyright notice, this
14 * list of conditions and the following disclaimer in the documentation and/or
15 * other materials provided with the distribution.
16 *
17 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
28 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 ****************************************************************************/
32
33 #include "rdo.h"
34
35 #include <errno.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <pthread.h>
39
40 #include "cabac.h"
41 #include "context.h"
42 #include "encode_coding_tree.h"
43 #include "encoder.h"
44 #include "imagelist.h"
45 #include "inter.h"
46 #include "scalinglist.h"
47 #include "strategyselector.h"
48 #include "tables.h"
49 #include "transform.h"
50
51 #include "strategies/strategies-quant.h"
52
53
// Base quantization shift; kvz_rdoq adds qp_scaled / 6 and the transform
// shift to it to obtain q_bits.
#define QUANT_SHIFT 14
// Number of coefficients in one coefficient group (CG), and its log2 which is
// used to convert between CG index and coefficient scan index.
#define SCAN_SET_SIZE 16
#define LOG2_SCAN_SET_SIZE 4
// Sign hiding skips a CG when the scan distance between its first and last
// non-zero coefficient is below this value.
#define SBH_THRESHOLD 4

// Highest QP that has its own RD sampling output file and mutex.
#define RD_SAMPLING_MAX_LAST_QP 50

// One output stream per QP, opened by kvz_init_rdcost_outfiles and written to
// by save_ccc and save_accuracy.
static FILE *fastrd_learning_outfile[RD_SAMPLING_MAX_LAST_QP + 1] = {NULL};
// One mutex per QP, held while writing to the output stream of the same QP.
static pthread_mutex_t outfile_mutex[RD_SAMPLING_MAX_LAST_QP + 1];

// Five-entry lookup tables that are not used in this part of the file.
// NOTE(review): presumably indexed by the Go-Rice parameter (0..4) -- confirm
// against the code that reads them.
const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
66
/**
 * Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0)
 *
 * The values are fixed point: 0x08000 (1 << 15) stands for exactly one bit
 * and corresponds to the 1.0 entries of kvz_f_entropy_bits.
 * NOTE(review): the table is presumably indexed by (CABAC context state ^ bin
 * value) through CTX_ENTROPY_BITS; that macro is defined elsewhere, confirm
 * the layout there.
 */
const uint32_t kvz_entropy_bits[128] =
{
  0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f,
  0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d,
  0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c,
  0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a,
  0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759,
  0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458,
  0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156,
  0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55
};
81
// Entropy bits scaled so that 50% probability yields 1 bit.
// Floating point counterpart of kvz_entropy_bits with the same 128-entry
// layout: the first entries (1.0) correspond to 0x08000 in the integer table.
// Each source line below holds two consecutive table entries.
const float kvz_f_entropy_bits[128] =
{
  1.0, 1.0,
  0.92852783203125, 1.0751953125,
  0.86383056640625, 1.150390625,
  0.80499267578125, 1.225555419921875,
  0.751251220703125, 1.300750732421875,
  0.702056884765625, 1.375946044921875,
  0.656829833984375, 1.451141357421875,
  0.615203857421875, 1.526336669921875,
  0.576751708984375, 1.601531982421875,
  0.54119873046875, 1.67669677734375,
  0.508209228515625, 1.75189208984375,
  0.47760009765625, 1.82708740234375,
  0.449127197265625, 1.90228271484375,
  0.422637939453125, 1.97747802734375,
  0.39788818359375, 2.05267333984375,
  0.37481689453125, 2.127838134765625,
  0.353240966796875, 2.203033447265625,
  0.33306884765625, 2.278228759765625,
  0.31414794921875, 2.353424072265625,
  0.29644775390625, 2.428619384765625,
  0.279815673828125, 2.5037841796875,
  0.26422119140625, 2.5789794921875,
  0.24957275390625, 2.6541748046875,
  0.235809326171875, 2.7293701171875,
  0.222869873046875, 2.8045654296875,
  0.210662841796875, 2.879730224609375,
  0.199188232421875, 2.954925537109375,
  0.188385009765625, 3.030120849609375,
  0.17822265625, 3.105316162109375,
  0.168609619140625, 3.180511474609375,
  0.1595458984375, 3.255706787109375,
  0.1510009765625, 3.33087158203125,
  0.1429443359375, 3.40606689453125,
  0.135345458984375, 3.48126220703125,
  0.128143310546875, 3.55645751953125,
  0.121368408203125, 3.63165283203125,
  0.114959716796875, 3.706817626953125,
  0.10888671875, 3.782012939453125,
  0.1031494140625, 3.857208251953125,
  0.09771728515625, 3.932403564453125,
  0.09259033203125, 4.007598876953125,
  0.0877685546875, 4.082794189453125,
  0.083160400390625, 4.157958984375,
  0.078826904296875, 4.233154296875,
  0.07470703125, 4.308349609375,
  0.070831298828125, 4.383544921875,
  0.067138671875, 4.458740234375,
  0.06365966796875, 4.533935546875,
  0.06036376953125, 4.609100341796875,
  0.057220458984375, 4.684295654296875,
  0.05426025390625, 4.759490966796875,
  0.05145263671875, 4.834686279296875,
  0.048797607421875, 4.909881591796875,
  0.046295166015625, 4.985076904296875,
  0.043914794921875, 5.06024169921875,
  0.0416259765625, 5.13543701171875,
  0.03948974609375, 5.21063232421875,
  0.0374755859375, 5.285858154296875,
  0.035552978515625, 5.360992431640625,
  0.033721923828125, 5.43621826171875,
  0.031982421875, 5.51141357421875,
  0.03033447265625, 5.586578369140625,
  0.028778076171875, 5.661773681640625,
  0.027313232421875, 5.736968994140625,
};
150
151
// This struct is for passing data to kvz_rdoq_sign_hiding
// kvz_rdoq fills the arrays (only when sign hiding is enabled) and indexes
// them by the raster position of the coefficient inside the block, so each
// array has room for the largest, 32x32, block.
struct sh_rates_t {
  // Bit cost of increasing rate by one.
  // For a non-zero level this is rate(level + 1) - rate(level) from
  // kvz_get_ic_rate; for a zero level it is the entropy cost of coding the
  // "greater than one" flag as 0.
  int32_t inc[32 * 32];
  // Bit cost of decreasing rate by one.
  // rate(level - 1) - rate(level); only written for non-zero levels.
  int32_t dec[32 * 32];
  // Bit cost of going from zero to one.
  // Cost of the significance flag being 1 minus the cost of it being 0; set
  // to 0 for the last significant coefficient found by kvz_rdoq.
  int32_t sig_coeff_inc[32 * 32];
  // Coeff minus quantized coeff.
  // Computed as (level_double - level * (1 << q_bits)) >> (q_bits - 8).
  int32_t quant_delta[32 * 32];
};
163
kvz_init_rdcost_outfiles(const char * dir_path)164 int kvz_init_rdcost_outfiles(const char *dir_path)
165 {
166 #define RD_SAMPLING_MAX_FN_LENGTH 4095
167 static const char *basename_tmpl = "/%02i.txt";
168 char fn_template[RD_SAMPLING_MAX_FN_LENGTH + 1];
169 char fn[RD_SAMPLING_MAX_FN_LENGTH + 1];
170 int rv = 0, qp;
171
172 // As long as QP is a two-digit number, template and produced string should
173 // be equal in length ("%i" -> "22")
174 assert(RD_SAMPLING_MAX_LAST_QP <= 99);
175 assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
176
177 strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
178 strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
179
180 for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
181 pthread_mutex_t *curr = outfile_mutex + qp;
182
183 if (pthread_mutex_init(curr, NULL) != 0) {
184 fprintf(stderr, "Failed to create mutex\n");
185 rv = -1;
186 qp--;
187 goto out_destroy_mutexes;
188 }
189 }
190
191 for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
192 FILE *curr;
193
194 snprintf(fn, RD_SAMPLING_MAX_FN_LENGTH, fn_template, qp);
195 fn[RD_SAMPLING_MAX_FN_LENGTH] = 0;
196 curr = fopen(fn, "w");
197 if (curr == NULL) {
198 fprintf(stderr, "Failed to open %s: %s\n", fn, strerror(errno));
199 rv = -1;
200 qp--;
201 goto out_close_files;
202 }
203 fastrd_learning_outfile[qp] = curr;
204 }
205 goto out;
206
207 out_close_files:
208 for (; qp >= 0; qp--) {
209 fclose(fastrd_learning_outfile[qp]);
210 fastrd_learning_outfile[qp] = NULL;
211 }
212 goto out;
213
214 out_destroy_mutexes:
215 for (; qp >= 0; qp--) {
216 pthread_mutex_destroy(outfile_mutex + qp);
217 }
218 goto out;
219
220 out:
221 return rv;
222 #undef RD_SAMPLING_MAX_FN_LENGTH
223 }
224
225
226 /**
227 * \brief Calculate actual (or really close to actual) bitcost for coding
228 * coefficients.
229 *
230 * \param coeff coefficient array
231 * \param width coeff block width
232 * \param type data type (0 == luma)
233 *
234 * \returns bits needed to code input coefficients
235 */
get_coeff_cabac_cost(const encoder_state_t * const state,const coeff_t * coeff,int32_t width,int32_t type,int8_t scan_mode)236 static INLINE uint32_t get_coeff_cabac_cost(
237 const encoder_state_t * const state,
238 const coeff_t *coeff,
239 int32_t width,
240 int32_t type,
241 int8_t scan_mode)
242 {
243 // Make sure there are coeffs present
244 bool found = false;
245 for (int i = 0; i < width*width; i++) {
246 if (coeff[i] != 0) {
247 found = 1;
248 break;
249 }
250 }
251 if (!found) return 0;
252
253 // Take a copy of the CABAC so that we don't overwrite the contexts when
254 // counting the bits.
255 cabac_data_t cabac_copy;
256 memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
257
258 // Clear bytes and bits and set mode to "count"
259 cabac_copy.only_count = 1;
260 cabac_copy.num_buffered_bytes = 0;
261 cabac_copy.bits_left = 23;
262
263 // Execute the coding function.
264 // It is safe to drop the const modifier since state won't be modified
265 // when cabac.only_count is set.
266 kvz_encode_coeff_nxn((encoder_state_t*) state,
267 &cabac_copy,
268 coeff,
269 width,
270 type,
271 scan_mode,
272 0);
273
274 return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
275 }
276
save_ccc(int qp,const coeff_t * coeff,int32_t size,uint32_t ccc)277 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
278 {
279 pthread_mutex_t *mtx = outfile_mutex + qp;
280
281 assert(sizeof(coeff_t) == sizeof(int16_t));
282 assert(qp <= RD_SAMPLING_MAX_LAST_QP);
283
284 pthread_mutex_lock(mtx);
285
286 fwrite(&size, sizeof(size), 1, fastrd_learning_outfile[qp]);
287 fwrite(&ccc, sizeof(ccc), 1, fastrd_learning_outfile[qp]);
288 fwrite( coeff, sizeof(coeff_t), size, fastrd_learning_outfile[qp]);
289
290 pthread_mutex_unlock(mtx);
291 }
292
save_accuracy(int qp,uint32_t ccc,uint32_t fast_cost)293 static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
294 {
295 pthread_mutex_t *mtx = outfile_mutex + qp;
296
297 assert(qp <= RD_SAMPLING_MAX_LAST_QP);
298
299 pthread_mutex_lock(mtx);
300 fprintf(fastrd_learning_outfile[qp], "%u %u\n", fast_cost, ccc);
301 pthread_mutex_unlock(mtx);
302 }
303
304 /**
305 * \brief Estimate bitcost for coding coefficients.
306 *
307 * \param coeff coefficient array
308 * \param width coeff block width
309 * \param type data type (0 == luma)
310 *
311 * \returns number of bits needed to code coefficients
312 */
kvz_get_coeff_cost(const encoder_state_t * const state,const coeff_t * coeff,int32_t width,int32_t type,int8_t scan_mode)313 uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
314 const coeff_t *coeff,
315 int32_t width,
316 int32_t type,
317 int8_t scan_mode)
318 {
319 uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
320 uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;
321
322 if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
323 state->qp < MAX_FAST_COEFF_COST_QP) {
324 // TODO: do we need to assert(0) out of the fast-estimation branch if we
325 // are to save block costs, or should we just warn about it somewhere
326 // earlier (configuration validation I guess)?
327 if (save_cccs) {
328 assert(0 && "Fast RD sampling does not work with fast-residual-cost");
329 return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
330 } else {
331 uint64_t weights = kvz_fast_coeff_get_weights(state);
332 uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
333 if (check_accuracy) {
334 uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
335 save_accuracy(state->qp, ccc, fast_cost);
336 }
337 return fast_cost;
338 }
339 } else {
340 uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
341 if (save_cccs) {
342 save_ccc(state->qp, coeff, width * width, ccc);
343 }
344 return ccc;
345 }
346 }
347
348 #define COEF_REMAIN_BIN_REDUCTION 3
349 /** Calculates the cost for specific absolute transform level
350 * \param abs_level scaled quantized level
351 * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
352 * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
353 * \param abs_go_rice Rice parameter for coeff_abs_level_minus3
354 * \returns cost of given absolute transform level
355 * From HM 12.0
356 */
kvz_get_ic_rate(encoder_state_t * const state,uint32_t abs_level,uint16_t ctx_num_one,uint16_t ctx_num_abs,uint16_t abs_go_rice,uint32_t c1_idx,uint32_t c2_idx,int8_t type)357 INLINE int32_t kvz_get_ic_rate(encoder_state_t * const state,
358 uint32_t abs_level,
359 uint16_t ctx_num_one,
360 uint16_t ctx_num_abs,
361 uint16_t abs_go_rice,
362 uint32_t c1_idx,
363 uint32_t c2_idx,
364 int8_t type)
365 {
366 cabac_data_t * const cabac = &state->cabac;
367 int32_t rate = 1 << CTX_FRAC_BITS;
368 uint32_t base_level = (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
369 cabac_ctx_t *base_one_ctx = (type == 0) ? &(cabac->ctx.cu_one_model_luma[0]) : &(cabac->ctx.cu_one_model_chroma[0]);
370 cabac_ctx_t *base_abs_ctx = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[0]) : &(cabac->ctx.cu_abs_model_chroma[0]);
371
372 if ( abs_level >= base_level ) {
373 int32_t symbol = abs_level - base_level;
374 int32_t length;
375 if (symbol < (COEF_REMAIN_BIN_REDUCTION << abs_go_rice)) {
376 length = symbol>>abs_go_rice;
377 rate += (length+1+abs_go_rice) * (1 << CTX_FRAC_BITS);
378 } else {
379 length = abs_go_rice;
380 symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << abs_go_rice);
381 while (symbol >= (1<<length)) {
382 symbol -= (1<<(length++));
383 }
384 rate += (COEF_REMAIN_BIN_REDUCTION+length+1-abs_go_rice+length) * (1 << CTX_FRAC_BITS);
385 }
386 if (c1_idx < C1FLAG_NUMBER) {
387 rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
388
389 if (c2_idx < C2FLAG_NUMBER) {
390 rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1);
391 }
392 }
393 }
394 else if( abs_level == 1 ) {
395 rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0);
396 } else if( abs_level == 2 ) {
397 rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
398 rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0);
399 }
400
401 return rate;
402 }
403
404 /** Get the best level in RD sense
405 * \param coded_cost reference to coded cost
406 * \param coded_cost0 reference to cost when coefficient is 0
407 * \param coded_cost_sig reference to cost of significant coefficient
408 * \param level_double reference to unscaled quantized level
409 * \param max_abs_level scaled quantized level
410 * \param ctx_num_sig current ctxInc for coeff_abs_significant_flag
411 * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
412 * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
413 * \param abs_go_rice current Rice parameter for coeff_abs_level_minus3
414 * \param q_bits quantization step size
415 * \param temp correction factor
416 * \param last indicates if the coefficient is the last significant
417 * \returns best quantized transform level for given scan position
418 * This method calculates the best quantized transform level for a given scan position.
419 * From HM 12.0
420 */
kvz_get_coded_level(encoder_state_t * const state,double * coded_cost,double * coded_cost0,double * coded_cost_sig,int32_t level_double,uint32_t max_abs_level,uint16_t ctx_num_sig,uint16_t ctx_num_one,uint16_t ctx_num_abs,uint16_t abs_go_rice,uint32_t c1_idx,uint32_t c2_idx,int32_t q_bits,double temp,int8_t last,int8_t type)421 INLINE uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost, double *coded_cost0, double *coded_cost_sig,
422 int32_t level_double, uint32_t max_abs_level,
423 uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,
424 uint16_t abs_go_rice,
425 uint32_t c1_idx, uint32_t c2_idx,
426 int32_t q_bits,double temp, int8_t last, int8_t type)
427 {
428 cabac_data_t * const cabac = &state->cabac;
429 double cur_cost_sig = 0;
430 uint32_t best_abs_level = 0;
431 int32_t abs_level;
432 int32_t min_abs_level;
433 cabac_ctx_t* base_sig_model = type?(cabac->ctx.cu_sig_model_chroma):(cabac->ctx.cu_sig_model_luma);
434
435 if( !last && max_abs_level < 3 ) {
436 *coded_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
437 *coded_cost = *coded_cost0 + *coded_cost_sig;
438 if (max_abs_level == 0) return best_abs_level;
439 } else {
440 *coded_cost = MAX_DOUBLE;
441 }
442
443 if( !last ) {
444 cur_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
445 }
446
447 min_abs_level = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
448 for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
449 double err = (double)(level_double - ( abs_level * (1 << q_bits) ) );
450 double cur_cost = err * err * temp + state->lambda *
451 kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs,
452 abs_go_rice, c1_idx, c2_idx, type);
453 cur_cost += cur_cost_sig;
454
455 if( cur_cost < *coded_cost ) {
456 best_abs_level = abs_level;
457 *coded_cost = cur_cost;
458 *coded_cost_sig = cur_cost_sig;
459 }
460 }
461
462 return best_abs_level;
463 }
464
465
466 /** Calculates the cost of signaling the last significant coefficient in the block
467 * \param pos_x X coordinate of the last significant coefficient
468 * \param pos_y Y coordinate of the last significant coefficient
469 * \returns cost of last significant coefficient
470 * \param uiWidth width of the transform unit (TU)
471 *
472 * From HM 12.0
473 */
get_rate_last(const encoder_state_t * const state,const uint32_t pos_x,const uint32_t pos_y,int32_t * last_x_bits,int32_t * last_y_bits)474 static double get_rate_last(const encoder_state_t * const state,
475 const uint32_t pos_x, const uint32_t pos_y,
476 int32_t* last_x_bits, int32_t* last_y_bits)
477 {
478 uint32_t ctx_x = g_group_idx[pos_x];
479 uint32_t ctx_y = g_group_idx[pos_y];
480 double uiCost = last_x_bits[ ctx_x ] + last_y_bits[ ctx_y ];
481 if( ctx_x > 3 ) {
482 uiCost += CTX_FRAC_ONE_BIT * ((ctx_x - 2) >> 1);
483 }
484 if( ctx_y > 3 ) {
485 uiCost += CTX_FRAC_ONE_BIT * ((ctx_y - 2) >> 1);
486 }
487 return state->lambda * uiCost;
488 }
489
calc_last_bits(encoder_state_t * const state,int32_t width,int32_t height,int8_t type,int32_t * last_x_bits,int32_t * last_y_bits)490 static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t height, int8_t type,
491 int32_t* last_x_bits, int32_t* last_y_bits)
492 {
493 cabac_data_t * const cabac = &state->cabac;
494 int32_t bits_x = 0, bits_y = 0;
495 int32_t blk_size_offset_x, blk_size_offset_y, shiftX, shiftY;
496 int32_t ctx;
497
498 cabac_ctx_t *base_ctx_x = (type ? cabac->ctx.cu_ctx_last_x_chroma : cabac->ctx.cu_ctx_last_x_luma);
499 cabac_ctx_t *base_ctx_y = (type ? cabac->ctx.cu_ctx_last_y_chroma : cabac->ctx.cu_ctx_last_y_luma);
500
501 blk_size_offset_x = type ? 0: (kvz_g_convert_to_bit[ width ] *3 + ((kvz_g_convert_to_bit[ width ] +1)>>2));
502 blk_size_offset_y = type ? 0: (kvz_g_convert_to_bit[ height ]*3 + ((kvz_g_convert_to_bit[ height ]+1)>>2));
503 shiftX = type ? kvz_g_convert_to_bit[ width ] :((kvz_g_convert_to_bit[ width ]+3)>>2);
504 shiftY = type ? kvz_g_convert_to_bit[ height ] :((kvz_g_convert_to_bit[ height ]+3)>>2);
505
506
507 for (ctx = 0; ctx < g_group_idx[ width - 1 ]; ctx++) {
508 int32_t ctx_offset = blk_size_offset_x + (ctx >>shiftX);
509 last_x_bits[ ctx ] = bits_x + CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],0);
510 bits_x += CTX_ENTROPY_BITS(&base_ctx_x[ ctx_offset ],1);
511 }
512 last_x_bits[ctx] = bits_x;
513 for (ctx = 0; ctx < g_group_idx[ height - 1 ]; ctx++) {
514 int32_t ctx_offset = blk_size_offset_y + (ctx >>shiftY);
515 last_y_bits[ ctx ] = bits_y + CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],0);
516 bits_y += CTX_ENTROPY_BITS(&base_ctx_y[ ctx_offset ],1);
517 }
518 last_y_bits[ctx] = bits_y;
519 }
520
521 /**
522 * \brief Select which coefficient to change for sign hiding, and change it.
523 *
524 * When sign hiding is enabled, the last sign bit of the last coefficient is
525 * calculated from the parity of the other coefficients. If the parity is not
526 * correct, one coefficient has to be changed by one. This function uses
527 * tables generated during RDOQ to select the best coefficient to change.
528 */
kvz_rdoq_sign_hiding(const encoder_state_t * const state,const int32_t qp_scaled,const uint32_t * const scan2raster,const struct sh_rates_t * const sh_rates,const int32_t last_pos,const coeff_t * const coeffs,coeff_t * const quant_coeffs)529 void kvz_rdoq_sign_hiding(
530 const encoder_state_t *const state,
531 const int32_t qp_scaled,
532 const uint32_t *const scan2raster,
533 const struct sh_rates_t *const sh_rates,
534 const int32_t last_pos,
535 const coeff_t *const coeffs,
536 coeff_t *const quant_coeffs)
537 {
538 const encoder_control_t * const ctrl = state->encoder_control;
539
540 int inv_quant = kvz_g_inv_quant_scales[qp_scaled % 6];
541 // This somehow scales quant_delta into fractional bits. Instead of the bits
542 // being multiplied by lambda, the residual is divided by it, or something
543 // like that.
544 const int64_t rd_factor = (inv_quant * inv_quant * (1 << (2 * (qp_scaled / 6)))
545 / state->lambda / 16 / (1 << (2 * (ctrl->bitdepth - 8))) + 0.5);
546 const int last_cg = (last_pos - 1) >> LOG2_SCAN_SET_SIZE;
547
548 for (int32_t cg_scan = last_cg; cg_scan >= 0; cg_scan--) {
549 const int32_t cg_coeff_scan = cg_scan << LOG2_SCAN_SET_SIZE;
550
551 // Find positions of first and last non-zero coefficients in the CG.
552 int32_t last_nz_scan = -1;
553 for (int32_t coeff_i = SCAN_SET_SIZE - 1; coeff_i >= 0; --coeff_i) {
554 if (quant_coeffs[scan2raster[coeff_i + cg_coeff_scan]]) {
555 last_nz_scan = coeff_i;
556 break;
557 }
558 }
559 int32_t first_nz_scan = SCAN_SET_SIZE;
560 for (int32_t coeff_i = 0; coeff_i <= last_nz_scan; coeff_i++) {
561 if (quant_coeffs[scan2raster[coeff_i + cg_coeff_scan]]) {
562 first_nz_scan = coeff_i;
563 break;
564 }
565 }
566
567 if (last_nz_scan - first_nz_scan < SBH_THRESHOLD) {
568 continue;
569 }
570
571 const int32_t signbit = quant_coeffs[scan2raster[cg_coeff_scan + first_nz_scan]] <= 0;
572 unsigned abs_coeff_sum = 0;
573 for (int32_t coeff_scan = first_nz_scan; coeff_scan <= last_nz_scan; coeff_scan++) {
574 abs_coeff_sum += quant_coeffs[scan2raster[coeff_scan + cg_coeff_scan]];
575 }
576 if (signbit == (abs_coeff_sum & 0x1)) {
577 // Sign already matches with the parity, no need to modify coefficients.
578 continue;
579 }
580
581 // Otherwise, search for the best coeff to change by one and change it.
582
583 struct {
584 int64_t cost;
585 int pos;
586 int change;
587 } current, best = { MAX_INT64, 0, 0 };
588
589 const int last_coeff_scan = (cg_scan == last_cg ? last_nz_scan : SCAN_SET_SIZE - 1);
590 for (int coeff_scan = last_coeff_scan; coeff_scan >= 0; --coeff_scan) {
591 current.pos = scan2raster[coeff_scan + cg_coeff_scan];
592 // Shift the calculation back into original precision to avoid
593 // changing the bitstream.
594 # define PRECISION_INC (15 - CTX_FRAC_BITS)
595 int64_t quant_cost_in_bits = rd_factor * sh_rates->quant_delta[current.pos];
596
597 coeff_t abs_coeff = abs(quant_coeffs[current.pos]);
598
599 if (abs_coeff != 0) {
600 // Choose between incrementing and decrementing a non-zero coeff.
601
602 int64_t inc_bits = sh_rates->inc[current.pos];
603 int64_t dec_bits = sh_rates->dec[current.pos];
604 if (abs_coeff == 1) {
605 // We save sign bit and sig_coeff goes to zero.
606 dec_bits -= CTX_FRAC_ONE_BIT + sh_rates->sig_coeff_inc[current.pos];
607 }
608 if (cg_scan == last_cg && last_nz_scan == coeff_scan && abs_coeff == 1) {
609 // Changing the last non-zero bit in the last cg to zero.
610 // This might save a lot of bits if the next bits are already
611 // zeros, or just a coupple fractional bits if they are not.
612 // TODO: Check if calculating the real savings makes sense.
613 dec_bits -= 4 * CTX_FRAC_ONE_BIT;
614 }
615
616 inc_bits = -quant_cost_in_bits + inc_bits * (1 << PRECISION_INC);
617 dec_bits = quant_cost_in_bits + dec_bits * (1 << PRECISION_INC);
618
619 if (inc_bits < dec_bits) {
620 current.change = 1;
621 current.cost = inc_bits;
622 } else {
623 current.change = -1;
624 current.cost = dec_bits;
625
626 if (coeff_scan == first_nz_scan && abs_coeff == 1) {
627 // Don't turn first non-zero coeff into zero.
628 // Seems kind of arbitrary. It's probably because it could lead to
629 // breaking SBH_THRESHOLD.
630 current.cost = MAX_INT64;
631 }
632 }
633 } else {
634 // Try incrementing a zero coeff.
635
636 // Add sign bit, other bits and sig_coeff goes to one.
637 int bits = CTX_FRAC_ONE_BIT + sh_rates->inc[current.pos] + sh_rates->sig_coeff_inc[current.pos];
638 current.cost = -llabs(quant_cost_in_bits) + bits * (1 << PRECISION_INC);
639 current.change = 1;
640
641 if (coeff_scan < first_nz_scan) {
642 if (((coeffs[current.pos] >= 0) ? 0 : 1) != signbit) {
643 current.cost = MAX_INT64;
644 }
645 }
646 }
647
648 if (current.cost < best.cost) {
649 best = current;
650 }
651 }
652
653 if (quant_coeffs[best.pos] == 32767 || quant_coeffs[best.pos] == -32768) {
654 best.change = -1;
655 }
656
657 if (coeffs[best.pos] >= 0) {
658 quant_coeffs[best.pos] += best.change;
659 } else {
660 quant_coeffs[best.pos] -= best.change;
661 }
662 }
663 }
664
665
666 /** RDOQ with CABAC
667 * \returns void
668 * Rate distortion optimized quantization for entropy
669 * coding engines using probability models like CABAC
670 * From HM 12.0
671 */
kvz_rdoq(encoder_state_t * const state,coeff_t * coef,coeff_t * dest_coeff,int32_t width,int32_t height,int8_t type,int8_t scan_mode,int8_t block_type,int8_t tr_depth)672 void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
673 int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth)
674 {
675 const encoder_control_t * const encoder = state->encoder_control;
676 cabac_data_t * const cabac = &state->cabac;
677 uint32_t log2_tr_size = kvz_g_convert_to_bit[ width ] + 2;
678 int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; // Represents scaling through forward transform
679 uint16_t go_rice_param = 0;
680 uint32_t log2_block_size = kvz_g_convert_to_bit[ width ] + 2;
681 int32_t scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
682
683 int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
684
685 int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
686
687 const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
688 const double *err_scale = encoder->scaling_list.error_scale[log2_tr_size-2][scalinglist_type][qp_scaled%6];
689
690 double block_uncoded_cost = 0;
691
692 double cost_coeff [ 32 * 32 ];
693 double cost_sig [ 32 * 32 ];
694 double cost_coeff0[ 32 * 32 ];
695
696 struct sh_rates_t sh_rates;
697
698 const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
699 const uint32_t cg_size = 16;
700 const int32_t shift = 4 >> 1;
701 const uint32_t num_blk_side = width >> shift;
702 double cost_coeffgroup_sig[ 64 ];
703 uint32_t sig_coeffgroup_flag[ 64 ];
704
705 uint16_t ctx_set = 0;
706 int16_t c1 = 1;
707 int16_t c2 = 0;
708 double base_cost = 0;
709
710 uint32_t c1_idx = 0;
711 uint32_t c2_idx = 0;
712 int32_t base_level;
713
714 const uint32_t *scan = kvz_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ];
715
716 int32_t cg_last_scanpos = -1;
717 int32_t last_scanpos = -1;
718
719 uint32_t cg_num = width * height >> 4;
720
721 // Explicitly tell the only possible numbers of elements to be zeroed.
722 // Hope the compiler is able to utilize this information.
723 switch (cg_num) {
724 case 1: FILL_ARRAY(sig_coeffgroup_flag, 0, 1); break;
725 case 4: FILL_ARRAY(sig_coeffgroup_flag, 0, 4); break;
726 case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break;
727 case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
728 default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
729 }
730
731 cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
732 cabac_ctx_t *baseCtx = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : &(cabac->ctx.cu_sig_model_chroma[0]);
733 cabac_ctx_t *base_one_ctx = (type == 0) ? &(cabac->ctx.cu_one_model_luma[0]) : &(cabac->ctx.cu_one_model_chroma[0]);
734
735 struct {
736 double coded_level_and_dist;
737 double uncoded_dist;
738 double sig_cost;
739 double sig_cost_0;
740 int32_t nnz_before_pos0;
741 } rd_stats;
742
743 //Find last cg and last scanpos
744 int32_t cg_scanpos;
745 for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--)
746 {
747 for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--)
748 {
749 int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
750 uint32_t blkpos = scan[scanpos];
751 int32_t q = quant_coeff[blkpos];
752 int32_t level_double = coef[blkpos];
753 level_double = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1)));
754 uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits;
755
756 if (max_abs_level > 0) {
757 last_scanpos = scanpos;
758 ctx_set = (scanpos > 0 && type == 0) ? 2 : 0;
759 cg_last_scanpos = cg_scanpos;
760 sh_rates.sig_coeff_inc[blkpos] = 0;
761 break;
762 }
763 dest_coeff[blkpos] = 0;
764 }
765 if (last_scanpos != -1) break;
766 }
767
768 if (last_scanpos == -1) {
769 return;
770 }
771
772 for (; cg_scanpos >= 0; cg_scanpos--) cost_coeffgroup_sig[cg_scanpos] = 0;
773
774 int32_t last_x_bits[32], last_y_bits[32];
775 calc_last_bits(state, width, height, type, last_x_bits, last_y_bits);
776
777 for (int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
778 uint32_t cg_blkpos = scan_cg[cg_scanpos];
779 uint32_t cg_pos_y = cg_blkpos / num_blk_side;
780 uint32_t cg_pos_x = cg_blkpos - (cg_pos_y * num_blk_side);
781
782 int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
783 cg_pos_x, cg_pos_y, width);
784
785 FILL(rd_stats, 0);
786 for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
787 int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
788 if (scanpos > last_scanpos) continue;
789 uint32_t blkpos = scan[scanpos];
790 int32_t q = quant_coeff[blkpos];
791 double temp = err_scale[blkpos];
792 int32_t level_double = coef[blkpos];
793 level_double = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1)));
794 uint32_t max_abs_level = (level_double + (1 << (q_bits - 1))) >> q_bits;
795
796 double err = (double)level_double;
797 cost_coeff0[scanpos] = err * err * temp;
798 block_uncoded_cost += cost_coeff0[ scanpos ];
799 //===== coefficient level estimation =====
800 int32_t level;
801 uint16_t one_ctx = 4 * ctx_set + c1;
802 uint16_t abs_ctx = ctx_set + c2;
803
804 if( scanpos == last_scanpos ) {
805 level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
806 level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param,
807 c1_idx, c2_idx, q_bits, temp, 1, type );
808 } else {
809 uint32_t pos_y = blkpos >> log2_block_size;
810 uint32_t pos_x = blkpos - ( pos_y << log2_block_size );
811 uint16_t ctx_sig = (uint16_t)kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
812 log2_block_size, type);
813 level = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
814 level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param,
815 c1_idx, c2_idx, q_bits, temp, 0, type );
816 if (encoder->cfg.signhide_enable) {
817 int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1);
818 int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0);
819 sh_rates.sig_coeff_inc[blkpos] = greater_than_zero - zero;
820 }
821 }
822
823 if (encoder->cfg.signhide_enable) {
824 sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8);
825 if (level > 0) {
826 int32_t rate_now = kvz_get_ic_rate(state, level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
827 int32_t rate_up = kvz_get_ic_rate(state, level + 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
828 int32_t rate_down = kvz_get_ic_rate(state, level - 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
829 sh_rates.inc[blkpos] = rate_up - rate_now;
830 sh_rates.dec[blkpos] = rate_down - rate_now;
831 } else { // level == 0
832 sh_rates.inc[blkpos] = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx], 0);
833 }
834 }
835 dest_coeff[blkpos] = (coeff_t)level;
836 base_cost += cost_coeff[scanpos];
837
838 base_level = (c1_idx < C1FLAG_NUMBER) ? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
839 if (level >= base_level) {
840 if(level > 3*(1<<go_rice_param)) {
841 go_rice_param = MIN(go_rice_param + 1, 4);
842 }
843 }
844 if (level >= 1) c1_idx ++;
845
846 //===== update bin model =====
847 if (level > 1) {
848 c1 = 0;
849 c2 += (c2 < 2);
850 c2_idx ++;
851 } else if( (c1 < 3) && (c1 > 0) && level) {
852 c1++;
853 }
854
855 //===== context set update =====
856 if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) {
857 c2 = 0;
858 go_rice_param = 0;
859
860 c1_idx = 0;
861 c2_idx = 0;
862 ctx_set = (scanpos == SCAN_SET_SIZE || type != 0) ? 0 : 2;
863 if( c1 == 0 ) {
864 ctx_set++;
865 }
866 c1 = 1;
867 }
868
869 rd_stats.sig_cost += cost_sig[scanpos];
870 if ( scanpos_in_cg == 0 ) {
871 rd_stats.sig_cost_0 = cost_sig[scanpos];
872 }
873 if ( dest_coeff[blkpos] ) {
874 sig_coeffgroup_flag[cg_blkpos] = 1;
875 rd_stats.coded_level_and_dist += cost_coeff[scanpos] - cost_sig[scanpos];
876 rd_stats.uncoded_dist += cost_coeff0[scanpos];
877 if ( scanpos_in_cg != 0 ) {
878 rd_stats.nnz_before_pos0++;
879 }
880 }
881 } //end for (scanpos_in_cg)
882
883 if( cg_scanpos ) {
884 if (sig_coeffgroup_flag[cg_blkpos] == 0) {
885 uint32_t ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
886 cg_pos_y, width);
887 cost_coeffgroup_sig[cg_scanpos] = state->lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
888 base_cost += cost_coeffgroup_sig[cg_scanpos] - rd_stats.sig_cost;
889 } else {
890 if (cg_scanpos < cg_last_scanpos){
891 double cost_zero_cg;
892 uint32_t ctx_sig;
893 if (rd_stats.nnz_before_pos0 == 0) {
894 base_cost -= rd_stats.sig_cost_0;
895 rd_stats.sig_cost -= rd_stats.sig_cost_0;
896 }
897 // rd-cost if SigCoeffGroupFlag = 0, initialization
898 cost_zero_cg = base_cost;
899
900 // add SigCoeffGroupFlag cost to total cost
901 ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
902 cg_pos_y, width);
903
904 cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
905 base_cost += cost_coeffgroup_sig[cg_scanpos];
906 cost_zero_cg += state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
907
908 // try to convert the current coeff group from non-zero to all-zero
909 cost_zero_cg += rd_stats.uncoded_dist; // distortion for resetting non-zero levels to zero levels
910 cost_zero_cg -= rd_stats.coded_level_and_dist; // distortion and level cost for keeping all non-zero levels
911 cost_zero_cg -= rd_stats.sig_cost; // sig cost for all coeffs, including zero levels and non-zerl levels
912
913 // if we can save cost, change this block to all-zero block
914 if (cost_zero_cg < base_cost) {
915
916 sig_coeffgroup_flag[cg_blkpos] = 0;
917 base_cost = cost_zero_cg;
918
919 cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
920
921 // reset coeffs to 0 in this block
922 for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
923 int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
924 uint32_t blkpos = scan[scanpos];
925 if (dest_coeff[blkpos]){
926 dest_coeff[blkpos] = 0;
927 cost_coeff[scanpos] = cost_coeff0[scanpos];
928 cost_sig[scanpos] = 0;
929 }
930 }
931 } // end if ( cost_all_zeros < base_cost )
932 }
933 } // end if if (sig_coeffgroup_flag[ cg_blkpos ] == 0)
934 } else {
935 sig_coeffgroup_flag[cg_blkpos] = 1;
936 }
937 } //end for (cg_scanpos)
938
939 //===== estimate last position =====
940 double best_cost = 0;
941 int32_t ctx_cbf = 0;
942 int8_t found_last = 0;
943 int32_t best_last_idx_p1 = 0;
944
945 if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) {
946 best_cost = block_uncoded_cost + state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
947 base_cost += state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
948 } else {
949 cabac_ctx_t* base_cbf_model = type?(cabac->ctx.qt_cbf_model_chroma):(cabac->ctx.qt_cbf_model_luma);
950 ctx_cbf = ( type ? tr_depth : !tr_depth);
951 best_cost = block_uncoded_cost + state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
952 base_cost += state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
953 }
954
955 for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
956 uint32_t cg_blkpos = scan_cg[cg_scanpos];
957 base_cost -= cost_coeffgroup_sig[cg_scanpos];
958
959 if (sig_coeffgroup_flag[ cg_blkpos ]) {
960 for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
961 int32_t scanpos = cg_scanpos*cg_size + scanpos_in_cg;
962 if (scanpos > last_scanpos) continue;
963 uint32_t blkpos = scan[scanpos];
964
965 if( dest_coeff[ blkpos ] ) {
966 uint32_t pos_y = blkpos >> log2_block_size;
967 uint32_t pos_x = blkpos - ( pos_y << log2_block_size );
968
969 double cost_last = (scan_mode == SCAN_VER) ? get_rate_last(state, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(state, pos_x, pos_y, last_x_bits,last_y_bits );
970 double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
971
972 if( totalCost < best_cost ) {
973 best_last_idx_p1 = scanpos + 1;
974 best_cost = totalCost;
975 }
976 if( dest_coeff[ blkpos ] > 1 ) {
977 found_last = 1;
978 break;
979 }
980 base_cost -= cost_coeff[scanpos];
981 base_cost += cost_coeff0[scanpos];
982 } else {
983 base_cost -= cost_sig[scanpos];
984 }
985 } //end for
986 if (found_last) break;
987 } // end if (sig_coeffgroup_flag[ cg_blkpos ])
988 } // end for
989
990 uint32_t abs_sum = 0;
991 for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
992 int32_t blkPos = scan[scanpos];
993 int32_t level = dest_coeff[blkPos];
994 abs_sum += level;
995 dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level);
996 }
997 //===== clean uncoded coefficients =====
998 for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) {
999 dest_coeff[scan[scanpos]] = 0;
1000 }
1001
1002 if (encoder->cfg.signhide_enable && abs_sum >= 2) {
1003 kvz_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff);
1004 }
1005 }
1006
1007 /**
1008 * Calculate cost of actual motion vectors using CABAC coding
1009 */
kvz_get_mvd_coding_cost_cabac(const encoder_state_t * state,const cabac_data_t * cabac,const int32_t mvd_hor,const int32_t mvd_ver)1010 uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
1011 const cabac_data_t* cabac,
1012 const int32_t mvd_hor,
1013 const int32_t mvd_ver)
1014 {
1015 cabac_data_t cabac_copy = *cabac;
1016 cabac_copy.only_count = 1;
1017
1018 // It is safe to drop const here because cabac->only_count is set.
1019 kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
1020
1021 uint32_t bitcost =
1022 ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
1023 ((23 - cabac->bits_left) + (cabac->num_buffered_bytes << 3));
1024
1025 return bitcost;
1026 }
1027
1028 /** MVD cost calculation with CABAC
1029 * \returns int
1030 * Calculates Motion Vector cost and related costs using CABAC coding
1031 */
kvz_calc_mvd_cost_cabac(const encoder_state_t * state,int x,int y,int mv_shift,int16_t mv_cand[2][2],inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],int16_t num_cand,int32_t ref_idx,uint32_t * bitcost)1032 uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
1033 int x,
1034 int y,
1035 int mv_shift,
1036 int16_t mv_cand[2][2],
1037 inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
1038 int16_t num_cand,
1039 int32_t ref_idx,
1040 uint32_t *bitcost)
1041 {
1042 cabac_data_t state_cabac_copy;
1043 cabac_data_t* cabac;
1044 uint32_t merge_idx;
1045 vector2d_t mvd = { 0, 0 };
1046 int8_t merged = 0;
1047 int8_t cur_mv_cand = 0;
1048
1049 x *= 1 << mv_shift;
1050 y *= 1 << mv_shift;
1051
1052 // Check every candidate to find a match
1053 for (merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
1054 if (merge_cand[merge_idx].dir == 3) continue;
1055 if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
1056 merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
1057 state->frame->ref_LX[merge_cand[merge_idx].dir - 1][
1058 merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1]
1059 ] == ref_idx)
1060 {
1061 merged = 1;
1062 break;
1063 }
1064 }
1065
1066 // Store cabac state and contexts
1067 memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
1068
1069 // Clear bytes and bits and set mode to "count"
1070 state_cabac_copy.only_count = 1;
1071 state_cabac_copy.num_buffered_bytes = 0;
1072 state_cabac_copy.bits_left = 23;
1073
1074 cabac = &state_cabac_copy;
1075
1076 if (!merged) {
1077 vector2d_t mvd1 = {
1078 x - mv_cand[0][0],
1079 y - mv_cand[0][1],
1080 };
1081 vector2d_t mvd2 = {
1082 x - mv_cand[1][0],
1083 y - mv_cand[1][1],
1084 };
1085 uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
1086 uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
1087
1088 // Select candidate 1 if it has lower cost
1089 if (cand2_cost < cand1_cost) {
1090 cur_mv_cand = 1;
1091 mvd = mvd2;
1092 } else {
1093 mvd = mvd1;
1094 }
1095 }
1096
1097 cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
1098
1099 CABAC_BIN(cabac, merged, "MergeFlag");
1100 num_cand = state->encoder_control->cfg.max_merge;
1101 if (merged) {
1102 if (num_cand > 1) {
1103 int32_t ui;
1104 for (ui = 0; ui < num_cand - 1; ui++) {
1105 int32_t symbol = (ui != merge_idx);
1106 if (ui == 0) {
1107 cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
1108 CABAC_BIN(cabac, symbol, "MergeIndex");
1109 } else {
1110 CABAC_BIN_EP(cabac, symbol, "MergeIndex");
1111 }
1112 if (symbol == 0) break;
1113 }
1114 }
1115 } else {
1116 uint32_t ref_list_idx;
1117 uint32_t j;
1118 int ref_list[2] = { 0, 0 };
1119 for (j = 0; j < state->frame->ref->used_size; j++) {
1120 if (state->frame->ref->pocs[j] < state->frame->poc) {
1121 ref_list[0]++;
1122 } else {
1123 ref_list[1]++;
1124 }
1125 }
1126
1127 //ToDo: bidir mv support
1128 for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
1129 if (/*cur_cu->inter.mv_dir*/ 1 & (1 << ref_list_idx)) {
1130 if (ref_list[ref_list_idx] > 1) {
1131 // parseRefFrmIdx
1132 int32_t ref_frame = ref_idx;
1133
1134 cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
1135 CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
1136
1137 if (ref_frame > 0) {
1138 int32_t i;
1139 int32_t ref_num = ref_list[ref_list_idx] - 2;
1140
1141 cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
1142 ref_frame--;
1143
1144 for (i = 0; i < ref_num; ++i) {
1145 const uint32_t symbol = (i == ref_frame) ? 0 : 1;
1146
1147 if (i == 0) {
1148 CABAC_BIN(cabac, symbol, "ref_idx_lX");
1149 } else {
1150 CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
1151 }
1152 if (symbol == 0) break;
1153 }
1154 }
1155 }
1156
1157 // ToDo: Bidir vector support
1158 if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
1159 // It is safe to drop const here because cabac->only_count is set.
1160 kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
1161 }
1162
1163 // Signal which candidate MV to use
1164 kvz_cabac_write_unary_max_symbol(
1165 cabac,
1166 cabac->ctx.mvp_idx_model,
1167 cur_mv_cand,
1168 1,
1169 AMVP_MAX_NUM_CANDS - 1);
1170 }
1171 }
1172 }
1173
1174 *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
1175
1176 // Store bitcost before restoring cabac
1177 return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
1178 }
1179
kvz_close_rdcost_outfiles(void)1180 void kvz_close_rdcost_outfiles(void)
1181 {
1182 int i;
1183
1184 for (i = 0; i < RD_SAMPLING_MAX_LAST_QP; i++) {
1185 FILE *curr = fastrd_learning_outfile[i];
1186 pthread_mutex_t *curr_mtx = outfile_mutex + i;
1187 if (curr != NULL) {
1188 fclose(curr);
1189 }
1190 if (curr_mtx != NULL) {
1191 pthread_mutex_destroy(curr_mtx);
1192 }
1193 }
1194 }
1195