1 /*****************************************************************************
2  * This file is part of Kvazaar HEVC encoder.
3  *
4  * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without modification,
8  * are permitted provided that the following conditions are met:
9  *
10  * * Redistributions of source code must retain the above copyright notice, this
11  *   list of conditions and the following disclaimer.
12  *
13  * * Redistributions in binary form must reproduce the above copyright notice, this
14  *   list of conditions and the following disclaimer in the documentation and/or
15  *   other materials provided with the distribution.
16  *
17  * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
18  *   contributors may be used to endorse or promote products derived from
19  *   this software without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26  * INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION HOWEVER CAUSED AND ON
28  * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * INCLUDING NEGLIGENCE OR OTHERWISE ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  ****************************************************************************/
32 
33 #include "rdo.h"
34 
35 #include <errno.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <pthread.h>
39 
40 #include "cabac.h"
41 #include "context.h"
42 #include "encode_coding_tree.h"
43 #include "encoder.h"
44 #include "imagelist.h"
45 #include "inter.h"
46 #include "scalinglist.h"
47 #include "strategyselector.h"
48 #include "tables.h"
49 #include "transform.h"
50 
51 #include "strategies/strategies-quant.h"
52 
53 
// Base shift that kvz_rdoq adds to qp_scaled / 6 and the transform shift when
// forming q_bits.
#define QUANT_SHIFT          14
// Number of coefficients in one coefficient group (CG) and its base-2 log.
#define SCAN_SET_SIZE        16
#define LOG2_SCAN_SET_SIZE    4
// Sign bit hiding skips a CG when the scan distance between its first and
// last non-zero coefficient is smaller than this.
#define SBH_THRESHOLD         4

// Highest QP for which an RD sampling output file and mutex exist.
#define RD_SAMPLING_MAX_LAST_QP     50

// Per-QP output files opened by kvz_init_rdcost_outfiles, and the mutexes
// that serialize writes to them in save_ccc / save_accuracy.
static FILE *fastrd_learning_outfile[RD_SAMPLING_MAX_LAST_QP + 1] = {NULL};
static pthread_mutex_t outfile_mutex[RD_SAMPLING_MAX_LAST_QP + 1];

// NOTE(review): presumably Golomb-Rice tables indexed by the Rice parameter;
// they are not referenced in this part of the file, so confirm with callers.
const uint32_t kvz_g_go_rice_range[5] = { 7, 14, 26, 46, 78 };
const uint32_t kvz_g_go_rice_prefix_len[5] = { 8, 7, 6, 5, 4 };
66 
/**
 * Entropy bits to estimate coded bits in RDO / RDOQ (From HM 12.0)
 *
 * Fixed-point values: 0x08000 in the first two entries pairs with 1.0 in
 * kvz_f_entropy_bits, i.e. the table carries 15 fractional bits.
 * NOTE(review): presumably looked up through CTX_ENTROPY_BITS with a CABAC
 * context state as index; confirm against that macro's definition.
 */
const uint32_t kvz_entropy_bits[128] =
{
  0x08000, 0x08000, 0x076da, 0x089a0, 0x06e92, 0x09340, 0x0670a, 0x09cdf, 0x06029, 0x0a67f, 0x059dd, 0x0b01f, 0x05413, 0x0b9bf, 0x04ebf, 0x0c35f,
  0x049d3, 0x0ccff, 0x04546, 0x0d69e, 0x0410d, 0x0e03e, 0x03d22, 0x0e9de, 0x0397d, 0x0f37e, 0x03619, 0x0fd1e, 0x032ee, 0x106be, 0x02ffa, 0x1105d,
  0x02d37, 0x119fd, 0x02aa2, 0x1239d, 0x02836, 0x12d3d, 0x025f2, 0x136dd, 0x023d1, 0x1407c, 0x021d2, 0x14a1c, 0x01ff2, 0x153bc, 0x01e2f, 0x15d5c,
  0x01c87, 0x166fc, 0x01af7, 0x1709b, 0x0197f, 0x17a3b, 0x0181d, 0x183db, 0x016d0, 0x18d7b, 0x01595, 0x1971b, 0x0146c, 0x1a0bb, 0x01354, 0x1aa5a,
  0x0124c, 0x1b3fa, 0x01153, 0x1bd9a, 0x01067, 0x1c73a, 0x00f89, 0x1d0da, 0x00eb7, 0x1da79, 0x00df0, 0x1e419, 0x00d34, 0x1edb9, 0x00c82, 0x1f759,
  0x00bda, 0x200f9, 0x00b3c, 0x20a99, 0x00aa5, 0x21438, 0x00a17, 0x21dd8, 0x00990, 0x22778, 0x00911, 0x23118, 0x00898, 0x23ab8, 0x00826, 0x24458,
  0x007ba, 0x24df7, 0x00753, 0x25797, 0x006f2, 0x26137, 0x00696, 0x26ad7, 0x0063f, 0x27477, 0x005ed, 0x27e17, 0x0059f, 0x287b6, 0x00554, 0x29156,
  0x0050e, 0x29af6, 0x004cc, 0x2a497, 0x0048d, 0x2ae35, 0x00451, 0x2b7d6, 0x00418, 0x2c176, 0x003e2, 0x2cb15, 0x003af, 0x2d4b5, 0x0037f, 0x2de55
};
81 
// Entropy bits scaled so that 50% probability yields 1 bit.
// Floating-point counterpart of kvz_entropy_bits: entries are laid out in the
// same order, two per row, and the leading values equal the integer table
// divided by 32768 (e.g. 0x076da / 32768 = 0.92852783203125).
const float kvz_f_entropy_bits[128] =
{
  1.0, 1.0,
  0.92852783203125, 1.0751953125,
  0.86383056640625, 1.150390625,
  0.80499267578125, 1.225555419921875,
  0.751251220703125, 1.300750732421875,
  0.702056884765625, 1.375946044921875,
  0.656829833984375, 1.451141357421875,
  0.615203857421875, 1.526336669921875,
  0.576751708984375, 1.601531982421875,
  0.54119873046875, 1.67669677734375,
  0.508209228515625, 1.75189208984375,
  0.47760009765625, 1.82708740234375,
  0.449127197265625, 1.90228271484375,
  0.422637939453125, 1.97747802734375,
  0.39788818359375, 2.05267333984375,
  0.37481689453125, 2.127838134765625,
  0.353240966796875, 2.203033447265625,
  0.33306884765625, 2.278228759765625,
  0.31414794921875, 2.353424072265625,
  0.29644775390625, 2.428619384765625,
  0.279815673828125, 2.5037841796875,
  0.26422119140625, 2.5789794921875,
  0.24957275390625, 2.6541748046875,
  0.235809326171875, 2.7293701171875,
  0.222869873046875, 2.8045654296875,
  0.210662841796875, 2.879730224609375,
  0.199188232421875, 2.954925537109375,
  0.188385009765625, 3.030120849609375,
  0.17822265625, 3.105316162109375,
  0.168609619140625, 3.180511474609375,
  0.1595458984375, 3.255706787109375,
  0.1510009765625, 3.33087158203125,
  0.1429443359375, 3.40606689453125,
  0.135345458984375, 3.48126220703125,
  0.128143310546875, 3.55645751953125,
  0.121368408203125, 3.63165283203125,
  0.114959716796875, 3.706817626953125,
  0.10888671875, 3.782012939453125,
  0.1031494140625, 3.857208251953125,
  0.09771728515625, 3.932403564453125,
  0.09259033203125, 4.007598876953125,
  0.0877685546875, 4.082794189453125,
  0.083160400390625, 4.157958984375,
  0.078826904296875, 4.233154296875,
  0.07470703125, 4.308349609375,
  0.070831298828125, 4.383544921875,
  0.067138671875, 4.458740234375,
  0.06365966796875, 4.533935546875,
  0.06036376953125, 4.609100341796875,
  0.057220458984375, 4.684295654296875,
  0.05426025390625, 4.759490966796875,
  0.05145263671875, 4.834686279296875,
  0.048797607421875, 4.909881591796875,
  0.046295166015625, 4.985076904296875,
  0.043914794921875, 5.06024169921875,
  0.0416259765625, 5.13543701171875,
  0.03948974609375, 5.21063232421875,
  0.0374755859375, 5.285858154296875,
  0.035552978515625, 5.360992431640625,
  0.033721923828125, 5.43621826171875,
  0.031982421875, 5.51141357421875,
  0.03033447265625, 5.586578369140625,
  0.028778076171875, 5.661773681640625,
  0.027313232421875, 5.736968994140625,
};
150 
151 
// This struct is for passing data to kvz_rdoq_sign_hiding.
// Every array has room for a 32x32 block; kvz_rdoq_sign_hiding indexes them
// with raster positions taken from its scan2raster table.
struct sh_rates_t {
  // Change in bit cost when the coefficient's absolute value is increased by
  // one.
  int32_t inc[32 * 32];
  // Change in bit cost when the coefficient's absolute value is decreased by
  // one.
  int32_t dec[32 * 32];
  // Bit cost of going from zero to one; added when a zero coefficient becomes
  // non-zero and subtracted when a coefficient of magnitude one becomes zero.
  int32_t sig_coeff_inc[32 * 32];
  // Coeff minus quantized coeff. Multiplied by an RD factor in
  // kvz_rdoq_sign_hiding to weigh the distortion change against the bit cost.
  int32_t quant_delta[32 * 32];
};
163 
kvz_init_rdcost_outfiles(const char * dir_path)164 int kvz_init_rdcost_outfiles(const char *dir_path)
165 {
166 #define RD_SAMPLING_MAX_FN_LENGTH 4095
167   static const char *basename_tmpl = "/%02i.txt";
168   char fn_template[RD_SAMPLING_MAX_FN_LENGTH + 1];
169   char fn[RD_SAMPLING_MAX_FN_LENGTH + 1];
170   int rv = 0, qp;
171 
172   // As long as QP is a two-digit number, template and produced string should
173   // be equal in length ("%i" -> "22")
174   assert(RD_SAMPLING_MAX_LAST_QP <= 99);
175   assert(strlen(fn_template) <= RD_SAMPLING_MAX_FN_LENGTH);
176 
177   strncpy(fn_template, dir_path, RD_SAMPLING_MAX_FN_LENGTH);
178   strncat(fn_template, basename_tmpl, RD_SAMPLING_MAX_FN_LENGTH - strlen(dir_path));
179 
180   for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
181     pthread_mutex_t *curr = outfile_mutex + qp;
182 
183     if (pthread_mutex_init(curr, NULL) != 0) {
184       fprintf(stderr, "Failed to create mutex\n");
185       rv = -1;
186       qp--;
187       goto out_destroy_mutexes;
188     }
189   }
190 
191   for (qp = 0; qp <= RD_SAMPLING_MAX_LAST_QP; qp++) {
192     FILE *curr;
193 
194     snprintf(fn, RD_SAMPLING_MAX_FN_LENGTH, fn_template, qp);
195     fn[RD_SAMPLING_MAX_FN_LENGTH] = 0;
196     curr = fopen(fn, "w");
197     if (curr == NULL) {
198       fprintf(stderr, "Failed to open %s: %s\n", fn, strerror(errno));
199       rv = -1;
200       qp--;
201       goto out_close_files;
202     }
203     fastrd_learning_outfile[qp] = curr;
204   }
205   goto out;
206 
207 out_close_files:
208   for (; qp >= 0; qp--) {
209     fclose(fastrd_learning_outfile[qp]);
210     fastrd_learning_outfile[qp] = NULL;
211   }
212   goto out;
213 
214 out_destroy_mutexes:
215   for (; qp >= 0; qp--) {
216     pthread_mutex_destroy(outfile_mutex + qp);
217   }
218   goto out;
219 
220 out:
221   return rv;
222 #undef RD_SAMPLING_MAX_FN_LENGTH
223 }
224 
225 
226 /**
227  * \brief Calculate actual (or really close to actual) bitcost for coding
228  * coefficients.
229  *
230  * \param coeff coefficient array
231  * \param width coeff block width
232  * \param type data type (0 == luma)
233  *
234  * \returns bits needed to code input coefficients
235  */
get_coeff_cabac_cost(const encoder_state_t * const state,const coeff_t * coeff,int32_t width,int32_t type,int8_t scan_mode)236 static INLINE uint32_t get_coeff_cabac_cost(
237     const encoder_state_t * const state,
238     const coeff_t *coeff,
239     int32_t width,
240     int32_t type,
241     int8_t scan_mode)
242 {
243   // Make sure there are coeffs present
244   bool found = false;
245   for (int i = 0; i < width*width; i++) {
246     if (coeff[i] != 0) {
247       found = 1;
248       break;
249     }
250   }
251   if (!found) return 0;
252 
253   // Take a copy of the CABAC so that we don't overwrite the contexts when
254   // counting the bits.
255   cabac_data_t cabac_copy;
256   memcpy(&cabac_copy, &state->cabac, sizeof(cabac_copy));
257 
258   // Clear bytes and bits and set mode to "count"
259   cabac_copy.only_count = 1;
260   cabac_copy.num_buffered_bytes = 0;
261   cabac_copy.bits_left = 23;
262 
263   // Execute the coding function.
264   // It is safe to drop the const modifier since state won't be modified
265   // when cabac.only_count is set.
266   kvz_encode_coeff_nxn((encoder_state_t*) state,
267                        &cabac_copy,
268                        coeff,
269                        width,
270                        type,
271                        scan_mode,
272                        0);
273 
274   return (23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3);
275 }
276 
save_ccc(int qp,const coeff_t * coeff,int32_t size,uint32_t ccc)277 static INLINE void save_ccc(int qp, const coeff_t *coeff, int32_t size, uint32_t ccc)
278 {
279   pthread_mutex_t *mtx = outfile_mutex + qp;
280 
281   assert(sizeof(coeff_t) == sizeof(int16_t));
282   assert(qp <= RD_SAMPLING_MAX_LAST_QP);
283 
284   pthread_mutex_lock(mtx);
285 
286   fwrite(&size,  sizeof(size),     1,    fastrd_learning_outfile[qp]);
287   fwrite(&ccc,   sizeof(ccc),      1,    fastrd_learning_outfile[qp]);
288   fwrite( coeff, sizeof(coeff_t),  size, fastrd_learning_outfile[qp]);
289 
290   pthread_mutex_unlock(mtx);
291 }
292 
save_accuracy(int qp,uint32_t ccc,uint32_t fast_cost)293 static INLINE void save_accuracy(int qp, uint32_t ccc, uint32_t fast_cost)
294 {
295   pthread_mutex_t *mtx = outfile_mutex + qp;
296 
297   assert(qp <= RD_SAMPLING_MAX_LAST_QP);
298 
299   pthread_mutex_lock(mtx);
300   fprintf(fastrd_learning_outfile[qp], "%u %u\n", fast_cost, ccc);
301   pthread_mutex_unlock(mtx);
302 }
303 
304 /**
305  * \brief Estimate bitcost for coding coefficients.
306  *
307  * \param coeff   coefficient array
308  * \param width   coeff block width
309  * \param type    data type (0 == luma)
310  *
311  * \returns       number of bits needed to code coefficients
312  */
kvz_get_coeff_cost(const encoder_state_t * const state,const coeff_t * coeff,int32_t width,int32_t type,int8_t scan_mode)313 uint32_t kvz_get_coeff_cost(const encoder_state_t * const state,
314                             const coeff_t *coeff,
315                             int32_t width,
316                             int32_t type,
317                             int8_t scan_mode)
318 {
319   uint8_t save_cccs = state->encoder_control->cfg.fastrd_sampling_on;
320   uint8_t check_accuracy = state->encoder_control->cfg.fastrd_accuracy_check_on;
321 
322   if (state->qp < state->encoder_control->cfg.fast_residual_cost_limit &&
323       state->qp < MAX_FAST_COEFF_COST_QP) {
324     // TODO: do we need to assert(0) out of the fast-estimation branch if we
325     // are to save block costs, or should we just warn about it somewhere
326     // earlier (configuration validation I guess)?
327     if (save_cccs) {
328       assert(0 && "Fast RD sampling does not work with fast-residual-cost");
329       return UINT32_MAX; // Hush little compiler don't you cry, not really gonna return anything after assert(0)
330     } else {
331       uint64_t weights = kvz_fast_coeff_get_weights(state);
332       uint32_t fast_cost = kvz_fast_coeff_cost(coeff, width, weights);
333       if (check_accuracy) {
334         uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
335         save_accuracy(state->qp, ccc, fast_cost);
336       }
337       return fast_cost;
338     }
339   } else {
340     uint32_t ccc = get_coeff_cabac_cost(state, coeff, width, type, scan_mode);
341     if (save_cccs) {
342       save_ccc(state->qp, coeff, width * width, ccc);
343     }
344     return ccc;
345   }
346 }
347 
348 #define COEF_REMAIN_BIN_REDUCTION 3
349 /** Calculates the cost for specific absolute transform level
350  * \param abs_level scaled quantized level
351  * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
352  * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
353  * \param abs_go_rice Rice parameter for coeff_abs_level_minus3
354  * \returns cost of given absolute transform level
355  * From HM 12.0
356  */
kvz_get_ic_rate(encoder_state_t * const state,uint32_t abs_level,uint16_t ctx_num_one,uint16_t ctx_num_abs,uint16_t abs_go_rice,uint32_t c1_idx,uint32_t c2_idx,int8_t type)357 INLINE int32_t kvz_get_ic_rate(encoder_state_t * const state,
358                     uint32_t abs_level,
359                     uint16_t ctx_num_one,
360                     uint16_t ctx_num_abs,
361                     uint16_t abs_go_rice,
362                     uint32_t c1_idx,
363                     uint32_t c2_idx,
364                     int8_t type)
365 {
366   cabac_data_t * const cabac = &state->cabac;
367   int32_t rate = 1 << CTX_FRAC_BITS;
368   uint32_t base_level  =  (c1_idx < C1FLAG_NUMBER)? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
369   cabac_ctx_t *base_one_ctx = (type == 0) ? &(cabac->ctx.cu_one_model_luma[0]) : &(cabac->ctx.cu_one_model_chroma[0]);
370   cabac_ctx_t *base_abs_ctx = (type == 0) ? &(cabac->ctx.cu_abs_model_luma[0]) : &(cabac->ctx.cu_abs_model_chroma[0]);
371 
372   if ( abs_level >= base_level ) {
373     int32_t symbol     = abs_level - base_level;
374     int32_t length;
375     if (symbol < (COEF_REMAIN_BIN_REDUCTION << abs_go_rice)) {
376       length = symbol>>abs_go_rice;
377       rate += (length+1+abs_go_rice) * (1 << CTX_FRAC_BITS);
378     } else {
379       length = abs_go_rice;
380       symbol  = symbol - ( COEF_REMAIN_BIN_REDUCTION << abs_go_rice);
381       while (symbol >= (1<<length)) {
382         symbol -=  (1<<(length++));
383       }
384       rate += (COEF_REMAIN_BIN_REDUCTION+length+1-abs_go_rice+length) * (1 << CTX_FRAC_BITS);
385     }
386     if (c1_idx < C1FLAG_NUMBER) {
387       rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
388 
389       if (c2_idx < C2FLAG_NUMBER) {
390         rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],1);
391       }
392     }
393   }
394   else if( abs_level == 1 ) {
395     rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],0);
396   } else if( abs_level == 2 ) {
397     rate += CTX_ENTROPY_BITS(&base_one_ctx[ctx_num_one],1);
398     rate += CTX_ENTROPY_BITS(&base_abs_ctx[ctx_num_abs],0);
399   }
400 
401   return rate;
402 }
403 
404 /** Get the best level in RD sense
405  * \param coded_cost reference to coded cost
406  * \param coded_cost0 reference to cost when coefficient is 0
407  * \param coded_cost_sig reference to cost of significant coefficient
408  * \param level_double reference to unscaled quantized level
409  * \param max_abs_level scaled quantized level
410  * \param ctx_num_sig current ctxInc for coeff_abs_significant_flag
411  * \param ctx_num_one current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
412  * \param ctx_num_abs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
413  * \param abs_go_rice current Rice parameter for coeff_abs_level_minus3
414  * \param q_bits quantization step size
415  * \param temp correction factor
416  * \param last indicates if the coefficient is the last significant
417  * \returns best quantized transform level for given scan position
418  * This method calculates the best quantized transform level for a given scan position.
419  * From HM 12.0
420  */
kvz_get_coded_level(encoder_state_t * const state,double * coded_cost,double * coded_cost0,double * coded_cost_sig,int32_t level_double,uint32_t max_abs_level,uint16_t ctx_num_sig,uint16_t ctx_num_one,uint16_t ctx_num_abs,uint16_t abs_go_rice,uint32_t c1_idx,uint32_t c2_idx,int32_t q_bits,double temp,int8_t last,int8_t type)421 INLINE uint32_t kvz_get_coded_level ( encoder_state_t * const state, double *coded_cost, double *coded_cost0, double *coded_cost_sig,
422                            int32_t level_double, uint32_t max_abs_level,
423                            uint16_t ctx_num_sig, uint16_t ctx_num_one, uint16_t ctx_num_abs,
424                            uint16_t abs_go_rice,
425                            uint32_t c1_idx, uint32_t c2_idx,
426                            int32_t q_bits,double temp, int8_t last, int8_t type)
427 {
428   cabac_data_t * const cabac = &state->cabac;
429   double cur_cost_sig   = 0;
430   uint32_t best_abs_level = 0;
431   int32_t abs_level;
432   int32_t min_abs_level;
433   cabac_ctx_t* base_sig_model = type?(cabac->ctx.cu_sig_model_chroma):(cabac->ctx.cu_sig_model_luma);
434 
435   if( !last && max_abs_level < 3 ) {
436     *coded_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 0);
437     *coded_cost     = *coded_cost0 + *coded_cost_sig;
438     if (max_abs_level == 0) return best_abs_level;
439   } else {
440     *coded_cost = MAX_DOUBLE;
441   }
442 
443   if( !last ) {
444     cur_cost_sig = state->lambda * CTX_ENTROPY_BITS(&base_sig_model[ctx_num_sig], 1);
445   }
446 
447   min_abs_level    = ( max_abs_level > 1 ? max_abs_level - 1 : 1 );
448   for (abs_level = max_abs_level; abs_level >= min_abs_level ; abs_level-- ) {
449     double err       = (double)(level_double - ( abs_level * (1 << q_bits) ) );
450     double cur_cost  = err * err * temp + state->lambda *
451                        kvz_get_ic_rate( state, abs_level, ctx_num_one, ctx_num_abs,
452                                     abs_go_rice, c1_idx, c2_idx, type);
453     cur_cost        += cur_cost_sig;
454 
455     if( cur_cost < *coded_cost ) {
456       best_abs_level  = abs_level;
457       *coded_cost     = cur_cost;
458       *coded_cost_sig = cur_cost_sig;
459     }
460   }
461 
462   return best_abs_level;
463 }
464 
465 
466 /** Calculates the cost of signaling the last significant coefficient in the block
467  * \param pos_x X coordinate of the last significant coefficient
468  * \param pos_y Y coordinate of the last significant coefficient
469  * \returns cost of last significant coefficient
470  * \param uiWidth width of the transform unit (TU)
471  *
472  * From HM 12.0
473 */
get_rate_last(const encoder_state_t * const state,const uint32_t pos_x,const uint32_t pos_y,int32_t * last_x_bits,int32_t * last_y_bits)474 static double get_rate_last(const encoder_state_t * const state,
475                             const uint32_t  pos_x, const uint32_t pos_y,
476                             int32_t* last_x_bits, int32_t* last_y_bits)
477 {
478   uint32_t ctx_x   = g_group_idx[pos_x];
479   uint32_t ctx_y   = g_group_idx[pos_y];
480   double uiCost = last_x_bits[ ctx_x ] + last_y_bits[ ctx_y ];
481   if( ctx_x > 3 ) {
482     uiCost += CTX_FRAC_ONE_BIT * ((ctx_x - 2) >> 1);
483   }
484   if( ctx_y > 3 ) {
485     uiCost += CTX_FRAC_ONE_BIT * ((ctx_y - 2) >> 1);
486   }
487   return state->lambda * uiCost;
488 }
489 
/**
 * \brief Fill the bit cost tables used by get_rate_last.
 *
 * For every group index below g_group_idx[size - 1] the table entry is the
 * accumulated cost of coding a 1 for all smaller indices plus the cost of
 * coding a 0 at this index. The final entry holds only the accumulated cost.
 *
 * \param state        encoder state whose CABAC contexts are read
 * \param width        block width, selects the X contexts
 * \param height       block height, selects the Y contexts
 * \param type         data type (0 == luma)
 * \param last_x_bits  output table for the X coordinate
 * \param last_y_bits  output table for the Y coordinate
 */
static void calc_last_bits(encoder_state_t * const state, int32_t width, int32_t height, int8_t type,
                           int32_t* last_x_bits, int32_t* last_y_bits)
{
  cabac_data_t * const cabac = &state->cabac;
  cabac_ctx_t *models_x;
  cabac_ctx_t *models_y;
  int32_t offset_x, offset_y;
  int32_t shift_x, shift_y;

  if (type) {
    models_x = cabac->ctx.cu_ctx_last_x_chroma;
    models_y = cabac->ctx.cu_ctx_last_y_chroma;
    offset_x = 0;
    offset_y = 0;
    shift_x  = kvz_g_convert_to_bit[width];
    shift_y  = kvz_g_convert_to_bit[height];
  } else {
    models_x = cabac->ctx.cu_ctx_last_x_luma;
    models_y = cabac->ctx.cu_ctx_last_y_luma;
    offset_x = kvz_g_convert_to_bit[width]  * 3 + ((kvz_g_convert_to_bit[width]  + 1) >> 2);
    offset_y = kvz_g_convert_to_bit[height] * 3 + ((kvz_g_convert_to_bit[height] + 1) >> 2);
    shift_x  = (kvz_g_convert_to_bit[width]  + 3) >> 2;
    shift_y  = (kvz_g_convert_to_bit[height] + 3) >> 2;
  }

  int32_t group = 0;
  int32_t prefix_bits = 0;
  while (group < g_group_idx[width - 1]) {
    cabac_ctx_t *model = &models_x[offset_x + (group >> shift_x)];
    last_x_bits[group] = prefix_bits + CTX_ENTROPY_BITS(model, 0);
    prefix_bits += CTX_ENTROPY_BITS(model, 1);
    group++;
  }
  last_x_bits[group] = prefix_bits;

  group = 0;
  prefix_bits = 0;
  while (group < g_group_idx[height - 1]) {
    cabac_ctx_t *model = &models_y[offset_y + (group >> shift_y)];
    last_y_bits[group] = prefix_bits + CTX_ENTROPY_BITS(model, 0);
    prefix_bits += CTX_ENTROPY_BITS(model, 1);
    group++;
  }
  last_y_bits[group] = prefix_bits;
}
520 
521 /**
522  * \brief Select which coefficient to change for sign hiding, and change it.
523  *
524  * When sign hiding is enabled, the last sign bit of the last coefficient is
525  * calculated from the parity of the other coefficients. If the parity is not
526  * correct, one coefficient has to be changed by one. This function uses
527  * tables generated during RDOQ to select the best coefficient to change.
528  */
kvz_rdoq_sign_hiding(const encoder_state_t * const state,const int32_t qp_scaled,const uint32_t * const scan2raster,const struct sh_rates_t * const sh_rates,const int32_t last_pos,const coeff_t * const coeffs,coeff_t * const quant_coeffs)529 void kvz_rdoq_sign_hiding(
530     const encoder_state_t *const state,
531     const int32_t qp_scaled,
532     const uint32_t *const scan2raster,
533     const struct sh_rates_t *const sh_rates,
534     const int32_t last_pos,
535     const coeff_t *const coeffs,
536     coeff_t *const quant_coeffs)
537 {
538   const encoder_control_t * const ctrl = state->encoder_control;
539 
540   int inv_quant = kvz_g_inv_quant_scales[qp_scaled % 6];
541   // This somehow scales quant_delta into fractional bits. Instead of the bits
542   // being multiplied by lambda, the residual is divided by it, or something
543   // like that.
544   const int64_t rd_factor = (inv_quant * inv_quant * (1 << (2 * (qp_scaled / 6)))
545                       / state->lambda / 16 / (1 << (2 * (ctrl->bitdepth - 8))) + 0.5);
546   const int last_cg = (last_pos - 1) >> LOG2_SCAN_SET_SIZE;
547 
548   for (int32_t cg_scan = last_cg; cg_scan >= 0; cg_scan--) {
549     const int32_t cg_coeff_scan = cg_scan << LOG2_SCAN_SET_SIZE;
550 
551     // Find positions of first and last non-zero coefficients in the CG.
552     int32_t last_nz_scan = -1;
553     for (int32_t coeff_i = SCAN_SET_SIZE - 1; coeff_i >= 0; --coeff_i) {
554       if (quant_coeffs[scan2raster[coeff_i + cg_coeff_scan]]) {
555         last_nz_scan = coeff_i;
556         break;
557       }
558     }
559     int32_t first_nz_scan = SCAN_SET_SIZE;
560     for (int32_t coeff_i = 0; coeff_i <= last_nz_scan; coeff_i++) {
561       if (quant_coeffs[scan2raster[coeff_i + cg_coeff_scan]]) {
562         first_nz_scan = coeff_i;
563         break;
564       }
565     }
566 
567     if (last_nz_scan - first_nz_scan < SBH_THRESHOLD) {
568       continue;
569     }
570 
571     const int32_t signbit = quant_coeffs[scan2raster[cg_coeff_scan + first_nz_scan]] <= 0;
572     unsigned abs_coeff_sum = 0;
573     for (int32_t coeff_scan = first_nz_scan; coeff_scan <= last_nz_scan; coeff_scan++) {
574       abs_coeff_sum += quant_coeffs[scan2raster[coeff_scan + cg_coeff_scan]];
575     }
576     if (signbit == (abs_coeff_sum & 0x1)) {
577       // Sign already matches with the parity, no need to modify coefficients.
578       continue;
579     }
580 
581     // Otherwise, search for the best coeff to change by one and change it.
582 
583     struct {
584       int64_t cost;
585       int pos;
586       int change;
587     } current, best = { MAX_INT64, 0, 0 };
588 
589     const int last_coeff_scan = (cg_scan == last_cg ? last_nz_scan : SCAN_SET_SIZE - 1);
590     for (int coeff_scan = last_coeff_scan; coeff_scan >= 0; --coeff_scan) {
591       current.pos = scan2raster[coeff_scan + cg_coeff_scan];
592       // Shift the calculation back into original precision to avoid
593       // changing the bitstream.
594 #     define PRECISION_INC (15 - CTX_FRAC_BITS)
595       int64_t quant_cost_in_bits = rd_factor * sh_rates->quant_delta[current.pos];
596 
597       coeff_t abs_coeff = abs(quant_coeffs[current.pos]);
598 
599       if (abs_coeff != 0) {
600         // Choose between incrementing and decrementing a non-zero coeff.
601 
602         int64_t inc_bits = sh_rates->inc[current.pos];
603         int64_t dec_bits = sh_rates->dec[current.pos];
604         if (abs_coeff == 1) {
605           // We save sign bit and sig_coeff goes to zero.
606           dec_bits -= CTX_FRAC_ONE_BIT + sh_rates->sig_coeff_inc[current.pos];
607         }
608         if (cg_scan == last_cg && last_nz_scan == coeff_scan && abs_coeff == 1) {
609           // Changing the last non-zero bit in the last cg to zero.
610           // This might save a lot of bits if the next bits are already
611           // zeros, or just a coupple fractional bits if they are not.
612           // TODO: Check if calculating the real savings makes sense.
613           dec_bits -= 4 * CTX_FRAC_ONE_BIT;
614         }
615 
616         inc_bits = -quant_cost_in_bits + inc_bits * (1 << PRECISION_INC);
617         dec_bits = quant_cost_in_bits + dec_bits * (1 << PRECISION_INC);
618 
619         if (inc_bits < dec_bits) {
620           current.change = 1;
621           current.cost = inc_bits;
622         } else {
623           current.change = -1;
624           current.cost = dec_bits;
625 
626           if (coeff_scan == first_nz_scan && abs_coeff == 1) {
627             // Don't turn first non-zero coeff into zero.
628             // Seems kind of arbitrary. It's probably because it could lead to
629             // breaking SBH_THRESHOLD.
630             current.cost = MAX_INT64;
631           }
632         }
633       } else {
634         // Try incrementing a zero coeff.
635 
636         // Add sign bit, other bits and sig_coeff goes to one.
637         int bits = CTX_FRAC_ONE_BIT + sh_rates->inc[current.pos] + sh_rates->sig_coeff_inc[current.pos];
638         current.cost = -llabs(quant_cost_in_bits) + bits * (1 << PRECISION_INC);
639         current.change = 1;
640 
641         if (coeff_scan < first_nz_scan) {
642           if (((coeffs[current.pos] >= 0) ? 0 : 1) != signbit) {
643             current.cost = MAX_INT64;
644           }
645         }
646       }
647 
648       if (current.cost < best.cost) {
649         best = current;
650       }
651     }
652 
653     if (quant_coeffs[best.pos] == 32767 || quant_coeffs[best.pos] == -32768) {
654       best.change = -1;
655     }
656 
657     if (coeffs[best.pos] >= 0) {
658       quant_coeffs[best.pos] += best.change;
659     } else {
660       quant_coeffs[best.pos] -= best.change;
661     }
662   }
663 }
664 
665 
666 /** RDOQ with CABAC
667  * \returns void
668  * Rate distortion optimized quantization for entropy
669  * coding engines using probability models like CABAC
670  * From HM 12.0
671  */
kvz_rdoq(encoder_state_t * const state,coeff_t * coef,coeff_t * dest_coeff,int32_t width,int32_t height,int8_t type,int8_t scan_mode,int8_t block_type,int8_t tr_depth)672 void kvz_rdoq(encoder_state_t * const state, coeff_t *coef, coeff_t *dest_coeff, int32_t width,
673            int32_t height, int8_t type, int8_t scan_mode, int8_t block_type, int8_t tr_depth)
674 {
675   const encoder_control_t * const encoder = state->encoder_control;
676   cabac_data_t * const cabac = &state->cabac;
677   uint32_t log2_tr_size      = kvz_g_convert_to_bit[ width ] + 2;
678   int32_t  transform_shift   = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size;  // Represents scaling through forward transform
679   uint16_t go_rice_param     = 0;
680   uint32_t log2_block_size   = kvz_g_convert_to_bit[ width ] + 2;
681   int32_t  scalinglist_type= (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
682 
683   int32_t qp_scaled = kvz_get_scaled_qp(type, state->qp, (encoder->bitdepth - 8) * 6);
684 
685   int32_t q_bits = QUANT_SHIFT + qp_scaled/6 + transform_shift;
686 
687   const int32_t *quant_coeff  = encoder->scaling_list.quant_coeff[log2_tr_size-2][scalinglist_type][qp_scaled%6];
688   const double *err_scale     = encoder->scaling_list.error_scale[log2_tr_size-2][scalinglist_type][qp_scaled%6];
689 
690   double block_uncoded_cost = 0;
691 
692   double cost_coeff [ 32 * 32 ];
693   double cost_sig   [ 32 * 32 ];
694   double cost_coeff0[ 32 * 32 ];
695 
696   struct sh_rates_t sh_rates;
697 
698   const uint32_t *scan_cg = g_sig_last_scan_cg[log2_block_size - 2][scan_mode];
699   const uint32_t cg_size = 16;
700   const int32_t  shift = 4 >> 1;
701   const uint32_t num_blk_side = width >> shift;
702   double   cost_coeffgroup_sig[ 64 ];
703   uint32_t sig_coeffgroup_flag[ 64 ];
704 
705   uint16_t    ctx_set    = 0;
706   int16_t     c1         = 1;
707   int16_t     c2         = 0;
708   double      base_cost  = 0;
709 
710   uint32_t    c1_idx     = 0;
711   uint32_t    c2_idx     = 0;
712   int32_t     base_level;
713 
714   const uint32_t *scan = kvz_g_sig_last_scan[ scan_mode ][ log2_block_size - 1 ];
715 
716   int32_t cg_last_scanpos = -1;
717   int32_t last_scanpos = -1;
718 
719   uint32_t cg_num = width * height >> 4;
720 
721   // Explicitly tell the only possible numbers of elements to be zeroed.
722   // Hope the compiler is able to utilize this information.
723   switch (cg_num) {
724     case  1: FILL_ARRAY(sig_coeffgroup_flag, 0,  1); break;
725     case  4: FILL_ARRAY(sig_coeffgroup_flag, 0,  4); break;
726     case 16: FILL_ARRAY(sig_coeffgroup_flag, 0, 16); break;
727     case 64: FILL_ARRAY(sig_coeffgroup_flag, 0, 64); break;
728     default: assert(0 && "There should be 1, 4, 16 or 64 coefficient groups");
729   }
730 
731   cabac_ctx_t *base_coeff_group_ctx = &(cabac->ctx.cu_sig_coeff_group_model[type]);
732   cabac_ctx_t *baseCtx              = (type == 0) ? &(cabac->ctx.cu_sig_model_luma[0]) : &(cabac->ctx.cu_sig_model_chroma[0]);
733   cabac_ctx_t *base_one_ctx = (type == 0) ? &(cabac->ctx.cu_one_model_luma[0]) : &(cabac->ctx.cu_one_model_chroma[0]);
734 
735   struct {
736     double coded_level_and_dist;
737     double uncoded_dist;
738     double sig_cost;
739     double sig_cost_0;
740     int32_t nnz_before_pos0;
741   } rd_stats;
742 
743   //Find last cg and last scanpos
744   int32_t cg_scanpos;
745   for (cg_scanpos = (cg_num - 1); cg_scanpos >= 0; cg_scanpos--)
746   {
747     for (int32_t scanpos_in_cg = (cg_size - 1); scanpos_in_cg >= 0; scanpos_in_cg--)
748     {
749       int32_t  scanpos        = cg_scanpos*cg_size + scanpos_in_cg;
750       uint32_t blkpos         = scan[scanpos];
751       int32_t q               = quant_coeff[blkpos];
752       int32_t level_double    = coef[blkpos];
753       level_double            = MIN(abs(level_double) * q, MAX_INT - (1 << (q_bits - 1)));
754       uint32_t max_abs_level  = (level_double + (1 << (q_bits - 1))) >> q_bits;
755 
756       if (max_abs_level > 0) {
757         last_scanpos    = scanpos;
758         ctx_set         = (scanpos > 0 && type == 0) ? 2 : 0;
759         cg_last_scanpos = cg_scanpos;
760         sh_rates.sig_coeff_inc[blkpos] = 0;
761         break;
762       }
763       dest_coeff[blkpos] = 0;
764     }
765     if (last_scanpos != -1) break;
766   }
767 
768   if (last_scanpos == -1) {
769     return;
770   }
771 
772   for (; cg_scanpos >= 0; cg_scanpos--) cost_coeffgroup_sig[cg_scanpos] = 0;
773 
774   int32_t last_x_bits[32], last_y_bits[32];
775   calc_last_bits(state, width, height, type, last_x_bits, last_y_bits);
776 
777   for (int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
778     uint32_t cg_blkpos  = scan_cg[cg_scanpos];
779     uint32_t cg_pos_y   = cg_blkpos / num_blk_side;
780     uint32_t cg_pos_x   = cg_blkpos - (cg_pos_y * num_blk_side);
781 
782     int32_t pattern_sig_ctx = kvz_context_calc_pattern_sig_ctx(sig_coeffgroup_flag,
783                                                            cg_pos_x, cg_pos_y, width);
784 
785     FILL(rd_stats, 0);
786     for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--)  {
787       int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
788       if (scanpos > last_scanpos) continue;
789       uint32_t blkpos         = scan[scanpos];
790       int32_t q               = quant_coeff[blkpos];
791       double temp             = err_scale[blkpos];
792       int32_t level_double    = coef[blkpos];
793       level_double            = MIN(abs(level_double) * q , MAX_INT - (1 << (q_bits - 1)));
794       uint32_t max_abs_level  = (level_double + (1 << (q_bits - 1))) >> q_bits;
795 
796       double err              = (double)level_double;
797       cost_coeff0[scanpos]    = err * err * temp;
798       block_uncoded_cost      += cost_coeff0[ scanpos ];
799       //===== coefficient level estimation =====
800       int32_t  level;
801       uint16_t  one_ctx = 4 * ctx_set + c1;
802       uint16_t  abs_ctx = ctx_set + c2;
803 
804       if( scanpos == last_scanpos ) {
805         level            = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
806                                              level_double, max_abs_level, 0, one_ctx, abs_ctx, go_rice_param,
807                                              c1_idx, c2_idx, q_bits, temp, 1, type );
808       } else {
809         uint32_t  pos_y    = blkpos >> log2_block_size;
810         uint32_t  pos_x    = blkpos - ( pos_y << log2_block_size );
811         uint16_t  ctx_sig  = (uint16_t)kvz_context_get_sig_ctx_inc(pattern_sig_ctx, scan_mode, pos_x, pos_y,
812                                                      log2_block_size, type);
813         level              = kvz_get_coded_level(state, &cost_coeff[ scanpos ], &cost_coeff0[ scanpos ], &cost_sig[ scanpos ],
814                                              level_double, max_abs_level, ctx_sig, one_ctx, abs_ctx, go_rice_param,
815                                              c1_idx, c2_idx, q_bits, temp, 0, type );
816         if (encoder->cfg.signhide_enable) {
817           int greater_than_zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 1);
818           int zero = CTX_ENTROPY_BITS(&baseCtx[ctx_sig], 0);
819           sh_rates.sig_coeff_inc[blkpos] = greater_than_zero - zero;
820         }
821       }
822 
823       if (encoder->cfg.signhide_enable) {
824         sh_rates.quant_delta[blkpos] = (level_double - level * (1 << q_bits)) >> (q_bits - 8);
825         if (level > 0) {
826           int32_t rate_now  = kvz_get_ic_rate(state, level, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
827           int32_t rate_up   = kvz_get_ic_rate(state, level + 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
828           int32_t rate_down = kvz_get_ic_rate(state, level - 1, one_ctx, abs_ctx, go_rice_param, c1_idx, c2_idx, type);
829           sh_rates.inc[blkpos] = rate_up - rate_now;
830           sh_rates.dec[blkpos] = rate_down - rate_now;
831         } else { // level == 0
832           sh_rates.inc[blkpos]   = CTX_ENTROPY_BITS(&base_one_ctx[one_ctx], 0);
833         }
834       }
835       dest_coeff[blkpos] = (coeff_t)level;
836       base_cost         += cost_coeff[scanpos];
837 
838       base_level = (c1_idx < C1FLAG_NUMBER) ? (2 + (c2_idx < C2FLAG_NUMBER)) : 1;
839       if (level >= base_level) {
840         if(level  > 3*(1<<go_rice_param)) {
841           go_rice_param = MIN(go_rice_param + 1, 4);
842         }
843       }
844       if (level >= 1) c1_idx ++;
845 
846       //===== update bin model =====
847       if (level > 1) {
848         c1 = 0;
849         c2 += (c2 < 2);
850         c2_idx ++;
851       } else if( (c1 < 3) && (c1 > 0) && level) {
852         c1++;
853       }
854 
855       //===== context set update =====
856       if ((scanpos % SCAN_SET_SIZE == 0) && scanpos > 0) {
857         c2                = 0;
858         go_rice_param     = 0;
859 
860         c1_idx   = 0;
861         c2_idx   = 0;
862         ctx_set = (scanpos == SCAN_SET_SIZE || type != 0) ? 0 : 2;
863         if( c1 == 0 ) {
864           ctx_set++;
865         }
866         c1 = 1;
867       }
868 
869       rd_stats.sig_cost += cost_sig[scanpos];
870       if ( scanpos_in_cg == 0 ) {
871         rd_stats.sig_cost_0 = cost_sig[scanpos];
872       }
873       if ( dest_coeff[blkpos] )  {
874         sig_coeffgroup_flag[cg_blkpos] = 1;
875         rd_stats.coded_level_and_dist   += cost_coeff[scanpos] - cost_sig[scanpos];
876         rd_stats.uncoded_dist           += cost_coeff0[scanpos];
877         if ( scanpos_in_cg != 0 ) {
878           rd_stats.nnz_before_pos0++;
879         }
880       }
881     } //end for (scanpos_in_cg)
882 
883     if( cg_scanpos ) {
884       if (sig_coeffgroup_flag[cg_blkpos] == 0) {
885         uint32_t ctx_sig  = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
886                                                         cg_pos_y, width);
887         cost_coeffgroup_sig[cg_scanpos] = state->lambda *CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig],0);
888         base_cost += cost_coeffgroup_sig[cg_scanpos]  - rd_stats.sig_cost;
889       } else {
890         if (cg_scanpos < cg_last_scanpos){
891           double cost_zero_cg;
892           uint32_t ctx_sig;
893           if (rd_stats.nnz_before_pos0 == 0) {
894             base_cost -= rd_stats.sig_cost_0;
895             rd_stats.sig_cost -= rd_stats.sig_cost_0;
896           }
897           // rd-cost if SigCoeffGroupFlag = 0, initialization
898           cost_zero_cg = base_cost;
899 
900           // add SigCoeffGroupFlag cost to total cost
901           ctx_sig = kvz_context_get_sig_coeff_group(sig_coeffgroup_flag, cg_pos_x,
902             cg_pos_y, width);
903 
904           cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 1);
905           base_cost += cost_coeffgroup_sig[cg_scanpos];
906           cost_zero_cg += state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
907 
908           // try to convert the current coeff group from non-zero to all-zero
909           cost_zero_cg += rd_stats.uncoded_dist;          // distortion for resetting non-zero levels to zero levels
910           cost_zero_cg -= rd_stats.coded_level_and_dist;  // distortion and level cost for keeping all non-zero levels
911           cost_zero_cg -= rd_stats.sig_cost;              // sig cost for all coeffs, including zero levels and non-zerl levels
912 
913           // if we can save cost, change this block to all-zero block
914           if (cost_zero_cg < base_cost) {
915 
916             sig_coeffgroup_flag[cg_blkpos] = 0;
917             base_cost = cost_zero_cg;
918 
919             cost_coeffgroup_sig[cg_scanpos] = state->lambda * CTX_ENTROPY_BITS(&base_coeff_group_ctx[ctx_sig], 0);
920 
921             // reset coeffs to 0 in this block
922             for (int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
923               int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
924               uint32_t blkpos = scan[scanpos];
925               if (dest_coeff[blkpos]){
926                 dest_coeff[blkpos] = 0;
927                 cost_coeff[scanpos] = cost_coeff0[scanpos];
928                 cost_sig[scanpos] = 0;
929               }
930             }
931           } // end if ( cost_all_zeros < base_cost )
932         }
933       } // end if if (sig_coeffgroup_flag[ cg_blkpos ] == 0)
934     } else {
935       sig_coeffgroup_flag[cg_blkpos] = 1;
936     }
937   } //end for (cg_scanpos)
938 
939   //===== estimate last position =====
940   double  best_cost        = 0;
941   int32_t ctx_cbf          = 0;
942   int8_t found_last        = 0;
943   int32_t best_last_idx_p1 = 0;
944 
945   if( block_type != CU_INTRA && !type/* && pcCU->getTransformIdx( uiAbsPartIdx ) == 0*/ ) {
946     best_cost  = block_uncoded_cost +   state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),0);
947     base_cost +=   state->lambda * CTX_ENTROPY_BITS(&(cabac->ctx.cu_qt_root_cbf_model),1);
948   } else {
949     cabac_ctx_t* base_cbf_model = type?(cabac->ctx.qt_cbf_model_chroma):(cabac->ctx.qt_cbf_model_luma);
950     ctx_cbf    = ( type ? tr_depth : !tr_depth);
951     best_cost  = block_uncoded_cost +  state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],0);
952     base_cost +=   state->lambda * CTX_ENTROPY_BITS(&base_cbf_model[ctx_cbf],1);
953   }
954 
955   for ( int32_t cg_scanpos = cg_last_scanpos; cg_scanpos >= 0; cg_scanpos--) {
956     uint32_t cg_blkpos = scan_cg[cg_scanpos];
957     base_cost -= cost_coeffgroup_sig[cg_scanpos];
958 
959     if (sig_coeffgroup_flag[ cg_blkpos ]) {
960       for ( int32_t scanpos_in_cg = cg_size - 1; scanpos_in_cg >= 0; scanpos_in_cg--) {
961         int32_t  scanpos = cg_scanpos*cg_size + scanpos_in_cg;
962         if (scanpos > last_scanpos) continue;
963         uint32_t blkpos  = scan[scanpos];
964 
965         if( dest_coeff[ blkpos ] ) {
966           uint32_t   pos_y = blkpos >> log2_block_size;
967           uint32_t   pos_x = blkpos - ( pos_y << log2_block_size );
968 
969           double cost_last = (scan_mode == SCAN_VER) ? get_rate_last(state, pos_y, pos_x,last_x_bits,last_y_bits) : get_rate_last(state, pos_x, pos_y, last_x_bits,last_y_bits );
970           double totalCost = base_cost + cost_last - cost_sig[ scanpos ];
971 
972           if( totalCost < best_cost ) {
973             best_last_idx_p1 = scanpos + 1;
974             best_cost        = totalCost;
975           }
976           if( dest_coeff[ blkpos ] > 1 ) {
977             found_last = 1;
978             break;
979           }
980           base_cost -= cost_coeff[scanpos];
981           base_cost += cost_coeff0[scanpos];
982         } else {
983           base_cost -= cost_sig[scanpos];
984         }
985       } //end for
986       if (found_last) break;
987     } // end if (sig_coeffgroup_flag[ cg_blkpos ])
988   } // end for
989 
990   uint32_t abs_sum = 0;
991   for ( int32_t scanpos = 0; scanpos < best_last_idx_p1; scanpos++) {
992     int32_t blkPos     = scan[scanpos];
993     int32_t level      = dest_coeff[blkPos];
994     abs_sum            += level;
995     dest_coeff[blkPos] = (coeff_t)(( coef[blkPos] < 0 ) ? -level : level);
996   }
997   //===== clean uncoded coefficients =====
998   for ( int32_t scanpos = best_last_idx_p1; scanpos <= last_scanpos; scanpos++) {
999     dest_coeff[scan[scanpos]] = 0;
1000   }
1001 
1002   if (encoder->cfg.signhide_enable && abs_sum >= 2) {
1003     kvz_rdoq_sign_hiding(state, qp_scaled, scan, &sh_rates, best_last_idx_p1, coef, dest_coeff);
1004   }
1005 }
1006 
1007 /**
1008  * Calculate cost of actual motion vectors using CABAC coding
1009  */
kvz_get_mvd_coding_cost_cabac(const encoder_state_t * state,const cabac_data_t * cabac,const int32_t mvd_hor,const int32_t mvd_ver)1010 uint32_t kvz_get_mvd_coding_cost_cabac(const encoder_state_t *state,
1011                                        const cabac_data_t* cabac,
1012                                        const int32_t mvd_hor,
1013                                        const int32_t mvd_ver)
1014 {
1015   cabac_data_t cabac_copy = *cabac;
1016   cabac_copy.only_count = 1;
1017 
1018   // It is safe to drop const here because cabac->only_count is set.
1019   kvz_encode_mvd((encoder_state_t*) state, &cabac_copy, mvd_hor, mvd_ver);
1020 
1021   uint32_t bitcost =
1022     ((23 - cabac_copy.bits_left) + (cabac_copy.num_buffered_bytes << 3)) -
1023     ((23 - cabac->bits_left)     + (cabac->num_buffered_bytes << 3));
1024 
1025   return bitcost;
1026 }
1027 
1028 /** MVD cost calculation with CABAC
1029 * \returns int
1030 * Calculates Motion Vector cost and related costs using CABAC coding
1031 */
kvz_calc_mvd_cost_cabac(const encoder_state_t * state,int x,int y,int mv_shift,int16_t mv_cand[2][2],inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],int16_t num_cand,int32_t ref_idx,uint32_t * bitcost)1032 uint32_t kvz_calc_mvd_cost_cabac(const encoder_state_t * state,
1033                                  int x,
1034                                  int y,
1035                                  int mv_shift,
1036                                  int16_t mv_cand[2][2],
1037                                  inter_merge_cand_t merge_cand[MRG_MAX_NUM_CANDS],
1038                                  int16_t num_cand,
1039                                  int32_t ref_idx,
1040                                  uint32_t *bitcost)
1041 {
1042   cabac_data_t state_cabac_copy;
1043   cabac_data_t* cabac;
1044   uint32_t merge_idx;
1045   vector2d_t mvd = { 0, 0 };
1046   int8_t merged = 0;
1047   int8_t cur_mv_cand = 0;
1048 
1049   x *= 1 << mv_shift;
1050   y *= 1 << mv_shift;
1051 
1052   // Check every candidate to find a match
1053   for (merge_idx = 0; merge_idx < (uint32_t)num_cand; merge_idx++) {
1054     if (merge_cand[merge_idx].dir == 3) continue;
1055     if (merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][0] == x &&
1056       merge_cand[merge_idx].mv[merge_cand[merge_idx].dir - 1][1] == y &&
1057       state->frame->ref_LX[merge_cand[merge_idx].dir - 1][
1058         merge_cand[merge_idx].ref[merge_cand[merge_idx].dir - 1]
1059       ] == ref_idx)
1060     {
1061       merged = 1;
1062       break;
1063     }
1064   }
1065 
1066   // Store cabac state and contexts
1067   memcpy(&state_cabac_copy, &state->cabac, sizeof(cabac_data_t));
1068 
1069   // Clear bytes and bits and set mode to "count"
1070   state_cabac_copy.only_count = 1;
1071   state_cabac_copy.num_buffered_bytes = 0;
1072   state_cabac_copy.bits_left = 23;
1073 
1074   cabac = &state_cabac_copy;
1075 
1076   if (!merged) {
1077     vector2d_t mvd1 = {
1078       x - mv_cand[0][0],
1079       y - mv_cand[0][1],
1080     };
1081     vector2d_t mvd2 = {
1082       x - mv_cand[1][0],
1083       y - mv_cand[1][1],
1084     };
1085     uint32_t cand1_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd1.x, mvd1.y);
1086     uint32_t cand2_cost = kvz_get_mvd_coding_cost_cabac(state, cabac, mvd2.x, mvd2.y);
1087 
1088     // Select candidate 1 if it has lower cost
1089     if (cand2_cost < cand1_cost) {
1090       cur_mv_cand = 1;
1091       mvd = mvd2;
1092     } else {
1093       mvd = mvd1;
1094     }
1095   }
1096 
1097   cabac->cur_ctx = &(cabac->ctx.cu_merge_flag_ext_model);
1098 
1099   CABAC_BIN(cabac, merged, "MergeFlag");
1100   num_cand = state->encoder_control->cfg.max_merge;
1101   if (merged) {
1102     if (num_cand > 1) {
1103       int32_t ui;
1104       for (ui = 0; ui < num_cand - 1; ui++) {
1105         int32_t symbol = (ui != merge_idx);
1106         if (ui == 0) {
1107           cabac->cur_ctx = &(cabac->ctx.cu_merge_idx_ext_model);
1108           CABAC_BIN(cabac, symbol, "MergeIndex");
1109         } else {
1110           CABAC_BIN_EP(cabac, symbol, "MergeIndex");
1111         }
1112         if (symbol == 0) break;
1113       }
1114     }
1115   } else {
1116     uint32_t ref_list_idx;
1117     uint32_t j;
1118     int ref_list[2] = { 0, 0 };
1119     for (j = 0; j < state->frame->ref->used_size; j++) {
1120       if (state->frame->ref->pocs[j] < state->frame->poc) {
1121         ref_list[0]++;
1122       } else {
1123         ref_list[1]++;
1124       }
1125     }
1126 
1127     //ToDo: bidir mv support
1128     for (ref_list_idx = 0; ref_list_idx < 2; ref_list_idx++) {
1129       if (/*cur_cu->inter.mv_dir*/ 1 & (1 << ref_list_idx)) {
1130         if (ref_list[ref_list_idx] > 1) {
1131           // parseRefFrmIdx
1132           int32_t ref_frame = ref_idx;
1133 
1134           cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[0]);
1135           CABAC_BIN(cabac, (ref_frame != 0), "ref_idx_lX");
1136 
1137           if (ref_frame > 0) {
1138             int32_t i;
1139             int32_t ref_num = ref_list[ref_list_idx] - 2;
1140 
1141             cabac->cur_ctx = &(cabac->ctx.cu_ref_pic_model[1]);
1142             ref_frame--;
1143 
1144             for (i = 0; i < ref_num; ++i) {
1145               const uint32_t symbol = (i == ref_frame) ? 0 : 1;
1146 
1147               if (i == 0) {
1148                 CABAC_BIN(cabac, symbol, "ref_idx_lX");
1149               } else {
1150                 CABAC_BIN_EP(cabac, symbol, "ref_idx_lX");
1151               }
1152               if (symbol == 0) break;
1153             }
1154           }
1155         }
1156 
1157         // ToDo: Bidir vector support
1158         if (!(state->frame->ref_list == REF_PIC_LIST_1 && /*cur_cu->inter.mv_dir == 3*/ 0)) {
1159           // It is safe to drop const here because cabac->only_count is set.
1160           kvz_encode_mvd((encoder_state_t*) state, cabac, mvd.x, mvd.y);
1161         }
1162 
1163         // Signal which candidate MV to use
1164         kvz_cabac_write_unary_max_symbol(
1165             cabac,
1166             cabac->ctx.mvp_idx_model,
1167             cur_mv_cand,
1168             1,
1169             AMVP_MAX_NUM_CANDS - 1);
1170       }
1171     }
1172   }
1173 
1174   *bitcost = (23 - state_cabac_copy.bits_left) + (state_cabac_copy.num_buffered_bytes << 3);
1175 
1176   // Store bitcost before restoring cabac
1177   return *bitcost * (uint32_t)(state->lambda_sqrt + 0.5);
1178 }
1179 
kvz_close_rdcost_outfiles(void)1180 void kvz_close_rdcost_outfiles(void)
1181 {
1182   int i;
1183 
1184   for (i = 0; i < RD_SAMPLING_MAX_LAST_QP; i++) {
1185     FILE *curr = fastrd_learning_outfile[i];
1186     pthread_mutex_t *curr_mtx = outfile_mutex + i;
1187     if (curr != NULL) {
1188       fclose(curr);
1189     }
1190     if (curr_mtx != NULL) {
1191       pthread_mutex_destroy(curr_mtx);
1192     }
1193   }
1194 }
1195