1 /*
2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <math.h>
13 #include <stdio.h>
14 
15 #include "./vp9_rtcd.h"
16 
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_mem/vpx_mem.h"
19 #include "vpx_ports/bitops.h"
20 #include "vpx_ports/mem.h"
21 #include "vpx_ports/system_state.h"
22 
23 #include "vp9/common/vp9_common.h"
24 #include "vp9/common/vp9_entropy.h"
25 #include "vp9/common/vp9_entropymode.h"
26 #include "vp9/common/vp9_mvref_common.h"
27 #include "vp9/common/vp9_pred_common.h"
28 #include "vp9/common/vp9_quant_common.h"
29 #include "vp9/common/vp9_reconinter.h"
30 #include "vp9/common/vp9_reconintra.h"
31 #include "vp9/common/vp9_seg_common.h"
32 
33 #include "vp9/encoder/vp9_cost.h"
34 #include "vp9/encoder/vp9_encodemb.h"
35 #include "vp9/encoder/vp9_encodemv.h"
36 #include "vp9/encoder/vp9_encoder.h"
37 #include "vp9/encoder/vp9_mcomp.h"
38 #include "vp9/encoder/vp9_quantize.h"
39 #include "vp9/encoder/vp9_ratectrl.h"
40 #include "vp9/encoder/vp9_rd.h"
41 #include "vp9/encoder/vp9_tokenize.h"
42 
43 #define RD_THRESH_POW 1.25
44 
45 // Factor to weigh the rate for switchable interp filters.
46 #define SWITCHABLE_INTERP_RATE_FACTOR 1
47 
// Mark an RD cost record as "no result yet": all fields saturated so any
// real candidate compares as better.
void vp9_rd_cost_reset(RD_COST *rd_cost) {
  rd_cost->rdcost = INT64_MAX;
  rd_cost->dist = INT64_MAX;
  rd_cost->rate = INT_MAX;
}
53 
// Zero an RD cost record (starting state for accumulation).
void vp9_rd_cost_init(RD_COST *rd_cost) {
  rd_cost->rdcost = 0;
  rd_cost->dist = 0;
  rd_cost->rate = 0;
}
59 
// Combine rate and distortion into a single rd cost. Negative rate or
// distortion is routed through the RDCOST_NEG_* variants (with the sign
// stripped first) so the macros never operate on negative operands.
int64_t vp9_calculate_rd_cost(int mult, int div, int rate, int64_t dist) {
  assert(mult >= 0);
  assert(div > 0);
  if (rate >= 0) {
    return dist >= 0 ? RDCOST(mult, div, rate, dist)
                     : RDCOST_NEG_D(mult, div, rate, -dist);
  }
  return dist >= 0 ? RDCOST_NEG_R(mult, div, -rate, dist)
                   : -RDCOST(mult, div, -rate, -dist);
}
74 
// Recompute rdcost from the stored rate/dist pair; if either field is
// saturated (no valid result), reset the whole record instead.
void vp9_rd_cost_update(int mult, int div, RD_COST *rd_cost) {
  if (rd_cost->rate >= INT_MAX || rd_cost->dist >= INT64_MAX) {
    vp9_rd_cost_reset(rd_cost);
    return;
  }
  rd_cost->rdcost =
      vp9_calculate_rd_cost(mult, div, rd_cost->rate, rd_cost->dist);
}
83 
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
// Indexed by BLOCK_SIZE; consumed in set_block_thresholds().
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
};
91 
// Precompute entropy-coding cost tables for mode syntax elements from the
// current frame context so the rd search can obtain rates by table lookup:
// key-frame y modes, uv modes, switchable interp filters and tx sizes.
static void fill_mode_costs(VP9_COMP *cpi) {
  const FRAME_CONTEXT *const fc = cpi->common.fc;
  int i, j;

  // Key-frame y-mode costs are conditioned on the (above, left) modes.
  for (i = 0; i < INTRA_MODES; ++i) {
    for (j = 0; j < INTRA_MODES; ++j) {
      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                      vp9_intra_mode_tree);
    }
  }

  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
  // UV-mode costs, conditioned on the y mode: key frames use the fixed kf
  // probabilities, inter frames the adaptive frame-context probabilities.
  for (i = 0; i < INTRA_MODES; ++i) {
    vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i],
                    vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree);
    vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i],
                    fc->uv_mode_prob[i], vp9_intra_mode_tree);
  }

  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
    vp9_cost_tokens(cpi->switchable_interp_costs[i],
                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
  }

  // Tx-size selection is coded as a unary-style run of binary decisions:
  // choosing size k under maximum size i costs one "one" bit per size
  // passed over, plus a terminating "zero" unless k is the maximum size
  // (the `k - (k == i)` bound drops the terminator in that case).
  for (i = TX_8X8; i < TX_SIZES; ++i) {
    for (j = 0; j < TX_SIZE_CONTEXTS; ++j) {
      const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs);
      int k;
      for (k = 0; k <= i; ++k) {
        int cost = 0;
        int m;
        for (m = 0; m <= k - (k == i); ++m) {
          if (m == k)
            cost += vp9_cost_zero(tx_probs[m]);
          else
            cost += vp9_cost_one(tx_probs[m]);
        }
        cpi->tx_size_cost[i - 1][j][k] = cost;
      }
    }
  }
}
134 
// Expand the compact model-based coefficient probabilities into full
// per-token cost tables for every (tx size, plane type, ref type, band,
// context) combination, in both the regular and the skip-EOB variants.
static void fill_token_costs(vp9_coeff_cost *c,
                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
  int i, j, k, l;
  TX_SIZE t;
  for (t = TX_4X4; t <= TX_32X32; ++t)
    for (i = 0; i < PLANE_TYPES; ++i)
      for (j = 0; j < REF_TYPES; ++j)
        for (k = 0; k < COEF_BANDS; ++k)
          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
            vpx_prob probs[ENTROPY_NODES];
            // Inflate the model probabilities to the full coefficient tree.
            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree);
            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                 vp9_coef_tree);
            // Both variants must agree on the EOB token cost.
            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
                   c[t][i][j][k][1][l][EOB_TOKEN]);
          }
}
153 
// Values are now correlated to quantizer.
// Per-qindex SAD-per-bit lookup tables (16x16- and 4x4-based costing), one
// pair per supported bit depth. Filled by vp9_init_me_luts() and read by
// vp9_initialize_me_consts().
static int sad_per_bit16lut_8[QINDEX_RANGE];
static int sad_per_bit4lut_8[QINDEX_RANGE];

#if CONFIG_VP9_HIGHBITDEPTH
static int sad_per_bit16lut_10[QINDEX_RANGE];
static int sad_per_bit4lut_10[QINDEX_RANGE];
static int sad_per_bit16lut_12[QINDEX_RANGE];
static int sad_per_bit4lut_12[QINDEX_RANGE];
#endif
164 
// Populate one pair of SAD-per-bit tables for the given bit depth using a
// linear model of the real quantizer value at each qindex.
static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
                            vpx_bit_depth_t bit_depth) {
  int qi;
  // Initialize the sad lut tables using a formulaic calculation for now.
  // This is to make it easier to resolve the impact of experimental changes
  // to the quantizer tables.
  for (qi = 0; qi < range; ++qi) {
    const double q = vp9_convert_qindex_to_q(qi, bit_depth);
    bit4lut[qi] = (int)(0.063 * q + 2.742);
    bit16lut[qi] = (int)(0.0418 * q + 2.4107);
  }
}
177 
// Fill the SAD-per-bit tables for every bit depth the build supports.
void vp9_init_me_luts(void) {
  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
                  VPX_BITS_8);
#if CONFIG_VP9_HIGHBITDEPTH
  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
                  VPX_BITS_10);
  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
                  VPX_BITS_12);
#endif
}
188 
// Q7 boost-dependent rdmult addend used in modulate_rdmult(); indexed by
// gfu_boost / 100, capped at 15.
static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                         8,  8,  4,  4,  2,  2,  1,  0 };

// Note that the element below for frame type "USE_BUF_FRAME", which indicates
// that the show frame flag is set, should not be used as no real frame
// is encoded so we should not reach here. However, a dummy value
// is inserted here to make sure the data structure has the right number
// of values assigned.
static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
                                                              128, 144, 144 };
199 
// Base rd multiplier (lambda) for a quantizer index: proportional to the
// square of the dc quant step, with a piecewise qindex- and frame-type-
// dependent scale. Always returns a positive value.
int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
  // largest dc_quant is 21387, therefore rdmult should always fit in int32_t
  const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
  uint32_t rdmult = q * q;

  if (cpi->common.frame_type != KEY_FRAME) {
    if (qindex < 128)
      rdmult = rdmult * 4;
    else if (qindex < 190)
      rdmult = rdmult * 4 + rdmult / 2;  // x4.5
    else
      rdmult = rdmult * 3;
  } else {
    if (qindex < 64)
      rdmult = rdmult * 4;
    else if (qindex <= 128)
      rdmult = rdmult * 3 + rdmult / 2;  // x3.5
    else if (qindex < 190)
      rdmult = rdmult * 4 + rdmult / 2;  // x4.5
    else
      rdmult = rdmult * 7 + rdmult / 2;  // x7.5
  }
#if CONFIG_VP9_HIGHBITDEPTH
  // Normalize for the larger quant step magnitudes at higher bit depths.
  switch (cpi->common.bit_depth) {
    case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
    case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
    default: break;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
  return rdmult > 0 ? rdmult : 1;
}
231 
// In two-pass encoding of non-key frames, scale rdmult by the Q7
// frame-update-type factor and add a Q7 boost-dependent term. 64-bit
// intermediates keep the products from overflowing.
static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
  int64_t rdmult_64 = rdmult;
  if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
    // Multi-layer ARF tracks a per-frame boost in the gf group; otherwise
    // use the rate-control-wide gfu boost.
    const int gfu_boost = cpi->multi_layer_arf
                              ? gf_group->gfu_boost[gf_group->index]
                              : cpi->rc.gfu_boost;
    const int boost_index = VPXMIN(15, (gfu_boost / 100));

    rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7;
    rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7);
  }
  return (int)rdmult_64;
}
247 
// Full rd multiplier for a qindex: base lambda adjusted by the two-pass
// frame-type and boost modulation.
int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
  const int base_rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
  return modulate_rdmult(cpi, base_rdmult);
}
252 
vp9_get_adaptive_rdmult(const VP9_COMP * cpi,double beta)253 int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) {
254   int rdmult =
255       vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex);
256   rdmult = (int)((double)rdmult / beta);
257   rdmult = rdmult > 0 ? rdmult : 1;
258   return modulate_rdmult(cpi, rdmult);
259 }
260 
// Map a quantizer index to the mode-pruning threshold scale factor.
// The dc quant step is first normalized toward the 8-bit range (divisors
// 4/16/64 per bit depth) so all depths produce comparable factors, then
// pushed through a power law with a floor of 8.
static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
  double q;
#if CONFIG_VP9_HIGHBITDEPTH
  switch (bit_depth) {
    case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
    case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
    default:
      assert(bit_depth == VPX_BITS_12);
      q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
      break;
  }
#else
  (void)bit_depth;
  q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
#endif  // CONFIG_VP9_HIGHBITDEPTH
  // TODO(debargha): Adjust the function below.
  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}
279 
// Load the per-qindex SAD-per-bit values into the macroblock, selecting the
// lookup tables that match the configured bit depth.
void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
#if CONFIG_VP9_HIGHBITDEPTH
  switch (cpi->common.bit_depth) {
    case VPX_BITS_8:
      x->sadperbit16 = sad_per_bit16lut_8[qindex];
      x->sadperbit4 = sad_per_bit4lut_8[qindex];
      break;
    case VPX_BITS_10:
      x->sadperbit16 = sad_per_bit16lut_10[qindex];
      x->sadperbit4 = sad_per_bit4lut_10[qindex];
      break;
    default:
      assert(cpi->common.bit_depth == VPX_BITS_12);
      x->sadperbit16 = sad_per_bit16lut_12[qindex];
      x->sadperbit4 = sad_per_bit4lut_12[qindex];
      break;
  }
#else
  (void)cpi;
  x->sadperbit16 = sad_per_bit16lut_8[qindex];
  x->sadperbit4 = sad_per_bit4lut_8[qindex];
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
303 
// Build the per-segment, per-block-size mode pruning thresholds by scaling
// the baseline thresh_mult values with the segment quantizer and the
// block-size correction table.
static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
  int i, bsize, segment_id;

  for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
    const int qindex =
        clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
                  cm->y_dc_delta_q,
              0, MAXQ);
    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);

    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
      // Threshold here seems unnecessarily harsh but fine given actual
      // range of values used for cpi->sf.thresh_mult[].
      const int t = q * rd_thresh_block_size_factor[bsize];
      // Cap prevents overflow in the thresh_mult * t product below.
      const int thresh_max = INT_MAX / t;

      // 8x8 and larger blocks index thresholds by prediction mode;
      // sub-8x8 blocks index by reference frame combination.
      if (bsize >= BLOCK_8X8) {
        for (i = 0; i < MAX_MODES; ++i)
          rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
                                                   ? rd->thresh_mult[i] * t / 4
                                                   : INT_MAX;
      } else {
        for (i = 0; i < MAX_REFS; ++i)
          rd->threshes[segment_id][bsize][i] =
              rd->thresh_mult_sub8x8[i] < thresh_max
                  ? rd->thresh_mult_sub8x8[i] * t / 4
                  : INT_MAX;
      }
    }
  }
}
335 
// Rebuild the inter-mode cost tables from the current frame context, one
// table per inter-mode context.
void vp9_build_inter_mode_cost(VP9_COMP *cpi) {
  const VP9_COMMON *const cm = &cpi->common;
  int ctx;
  for (ctx = 0; ctx < INTER_MODE_CONTEXTS; ++ctx)
    vp9_cost_tokens((int *)cpi->inter_mode_cost[ctx],
                    cm->fc->inter_mode_probs[ctx], vp9_inter_mode_tree);
}
344 
// Refresh per-frame rd constants: RDMULT/RDDIV, error-per-bit, block
// pruning thresholds, partition probabilities and - depending on the
// encoding pass and active speed features - the token, partition, mode,
// inter-mode and mv cost tables.
void vp9_initialize_rd_consts(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->td.mb;
  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
  RD_OPT *const rd = &cpi->rd;
  int i;

  vpx_clear_system_state();

  rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
  rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);

  set_error_per_bit(x, rd->RDMULT);

  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                       cm->frame_type != KEY_FRAME)
                          ? 0
                          : 1;

  set_block_thresholds(cm, rd);
  set_partition_probs(cm, xd);

  if (cpi->oxcf.pass == 1) {
    // First pass only needs mv costs, and only on inter frames.
    if (!frame_is_intra_only(cm))
      vp9_build_nmv_cost_table(
          x->nmvjointcost,
          cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
          &cm->fc->nmvc, cm->allow_high_precision_mv);
  } else {
    if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
      fill_token_costs(x->token_costs, cm->fc->coef_probs);

    if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
        cm->frame_type == KEY_FRAME) {
      for (i = 0; i < PARTITION_CONTEXTS; ++i)
        vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
                        vp9_partition_tree);
    }

    // With the non-rd pick mode, mode/mv cost tables are only refreshed
    // periodically (every 8th frame) or on key frames.
    if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
        cm->frame_type == KEY_FRAME) {
      fill_mode_costs(cpi);

      if (!frame_is_intra_only(cm)) {
        vp9_build_nmv_cost_table(
            x->nmvjointcost,
            cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
            &cm->fc->nmvc, cm->allow_high_precision_mv);
        vp9_build_inter_mode_cost(cpi);
      }
    }
  }
}
398 
// NOTE: The tables below must be of the same size.

// The functions described below are sampled at the four most significant
// bits of x^2 + 8 / 256.

// Normalized rate:
// This table models the rate for a Laplacian source with given variance
// when quantized with a uniform quantizer with given stepsize. The
// closed form expression is:
// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
// and H(x) is the binary entropy function.
// Entries are Q10; indexed/interpolated jointly with dist_tab_q10 via the
// sample positions in xsq_iq_q10 (see model_rd_norm()).
static const int rate_tab_q10[] = {
  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
  3958,  3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
  2952,  2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
  2130,  2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
  1342,  1290, 1243, 1199, 1159, 1086, 1021, 963,  911,  864,  821,  781,  745,
  680,   623,  574,  530,  490,  455,  424,  395,  345,  304,  269,  239,  213,
  190,   171,  154,  126,  104,  87,   73,   61,   52,   44,   38,   28,   21,
  16,    12,   10,   8,    6,    5,    3,    2,    1,    1,    1,    0,    0,
};
421 
// Normalized distortion:
// This table models the normalized distortion for a Laplacian source
// with given variance when quantized with a uniform quantizer
// with given stepsize. The closed form expression is:
// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
// where x = qpstep / sqrt(variance).
// Note the actual distortion is Dn * variance.
// Entries are Q10 (1024 == 1.0), sampled at the positions in xsq_iq_q10.
static const int dist_tab_q10[] = {
  0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,    5,
  6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,   18,   21,
  24,   26,   29,   31,   34,   36,   39,   44,   49,   54,   59,   64,   69,
  73,   78,   88,   97,   106,  115,  124,  133,  142,  151,  167,  184,  200,
  215,  231,  245,  260,  274,  301,  327,  351,  375,  397,  418,  439,  458,
  495,  528,  559,  587,  613,  637,  659,  680,  717,  749,  777,  801,  823,
  842,  859,  874,  899,  919,  936,  949,  960,  969,  977,  983,  994,  1001,
  1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
};
// The x^2 sample positions (Q10) at which the two tables above are
// evaluated; the spacing doubles every 8 entries, matching the
// msb/octave-based index computation in model_rd_norm().
static const int xsq_iq_q10[] = {
  0,      4,      8,      12,     16,     20,     24,     28,     32,
  40,     48,     56,     64,     72,     80,     88,     96,     112,
  128,    144,    160,    176,    192,    208,    224,    256,    288,
  320,    352,    384,    416,    448,    480,    544,    608,    672,
  736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
  1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
  3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
  7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
  36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
  81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
  180192, 196576, 212960, 229344, 245728,
};
453 
model_rd_norm(int xsq_q10,int * r_q10,int * d_q10)454 static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
455   const int tmp = (xsq_q10 >> 2) + 8;
456   const int k = get_msb(tmp) - 3;
457   const int xq = (k << 3) + ((tmp >> k) & 0x7);
458   const int one_q10 = 1 << 10;
459   const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
460   const int b_q10 = one_q10 - a_q10;
461   *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
462   *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
463 }
464 
model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],int r_q10[MAX_MB_PLANE],int d_q10[MAX_MB_PLANE])465 static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
466                               int r_q10[MAX_MB_PLANE],
467                               int d_q10[MAX_MB_PLANE]) {
468   int i;
469   const int one_q10 = 1 << 10;
470   for (i = 0; i < MAX_MB_PLANE; ++i) {
471     const int tmp = (xsq_q10[i] >> 2) + 8;
472     const int k = get_msb(tmp) - 3;
473     const int xq = (k << 3) + ((tmp >> k) & 0x7);
474     const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
475     const int b_q10 = one_q10 - a_q10;
476     r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
477     d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
478   }
479 }
480 
// Largest x^2 (Q10) the tables cover; callers clamp their input to this.
static const uint32_t MAX_XSQ_Q10 = 245727;
482 
// Estimate rate (in vp9 probability-cost units) and distortion for a block
// of 2^n_log2 samples with the given variance, quantized with step `qstep`.
void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                  unsigned int qstep, int *rate,
                                  int64_t *dist) {
  // This function models the rate and distortion for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expressions are in:
  // Hang and Chen, "Source Model for transform video coder and its
  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
  // Sys. for Video Tech., April 1997.
  if (var == 0) {
    *rate = 0;
    *dist = 0;
  } else {
    int d_q10, r_q10;
    // x^2 in Q10 = qstep^2 * 2^n_log2 / var, rounded; 64-bit arithmetic
    // avoids overflow, then the result is clamped to the table range.
    const uint64_t xsq_q10_64 =
        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
    const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
    model_rd_norm(xsq_q10, &r_q10, &d_q10);
    // Scale normalized rate by the sample count and convert Q10 to the
    // probability-cost precision.
    *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
    // Actual distortion = normalized distortion (Q10) * variance, rounded.
    *dist = (var * (int64_t)d_q10 + 512) >> 10;
  }
}
505 
// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
// Accumulates the per-plane rates and distortions into *rate_sum/*dist_sum.
void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
                                      unsigned int n_log2[MAX_MB_PLANE],
                                      unsigned int qstep[MAX_MB_PLANE],
                                      int64_t *rate_sum, int64_t *dist_sum) {
  int i;
  int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
  for (i = 0; i < MAX_MB_PLANE; ++i) {
    // Same rounded x^2 (Q10) computation as the scalar version; var[i]
    // must be non-zero here (see comment above).
    const uint64_t xsq_q10_64 =
        (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
        var[i];
    xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
  }
  model_rd_norm_vec(xsq_q10, r_q10, d_q10);
  for (i = 0; i < MAX_MB_PLANE; ++i) {
    int rate =
        ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
    int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
    *rate_sum += rate;
    *dist_sum += dist;
  }
}
529 
// Gather the above/left entropy contexts for a plane at the given tx size.
// For TX_4X4 the per-4x4 contexts are copied through; for larger tx sizes
// one context is produced per transform block, set iff any of the covered
// 4x4 contexts is non-zero (written at the first covered index only).
//
// Note: the previous implementation read groups of contexts through casted
// uint16_t/uint32_t/uint64_t pointers, which violates the C strict-aliasing
// rule (and alignment requirements) - undefined behavior. The byte-wise ORs
// below read exactly the same ENTROPY_CONTEXT entries and produce the same
// results without the type punning.
void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                              const struct macroblockd_plane *pd,
                              ENTROPY_CONTEXT t_above[16],
                              ENTROPY_CONTEXT t_left[16]) {
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const ENTROPY_CONTEXT *const above = pd->above_context;
  const ENTROPY_CONTEXT *const left = pd->left_context;
  int i, j;
  int step;

  switch (tx_size) {
    case TX_4X4:
      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
      return;
    case TX_8X8: step = 2; break;
    case TX_16X16: step = 4; break;
    default:
      assert(tx_size == TX_32X32);
      step = 8;
      break;
  }

  for (i = 0; i < num_4x4_w; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j) any |= above[i + j];
    t_above[i] = !!any;
  }
  for (i = 0; i < num_4x4_h; i += step) {
    ENTROPY_CONTEXT any = 0;
    for (j = 0; j < step; ++j) any |= left[i + j];
    t_left[i] = !!any;
  }
}
567 
// Evaluate the SAD of each candidate predicted mv against the source block
// and record, per reference frame: the index of the best candidate, the
// maximum full-pel mv magnitude seen, and the best SAD. Candidates are the
// two mv-ref candidates plus, for blocks below the max partition size, the
// stored pred mv.
void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                 int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
  int i;
  int zero_seen = 0;
  int best_index = 0;
  int best_sad = INT_MAX;
  int this_sad = INT_MAX;
  int max_mv = 0;
  int near_same_nearest;
  uint8_t *src_y_ptr = x->plane[0].src.buf;
  uint8_t *ref_y_ptr;
  const int num_mv_refs =
      MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size);

  MV pred_mv[3];
  pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
  pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
  pred_mv[2] = x->pred_mv[ref_frame];
  assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));

  // NEAR duplicates NEAREST - skip evaluating it twice below.
  near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
                      x->mbmi_ext->ref_mvs[ref_frame][1].as_int;

  // Get the sad for each candidate reference mv.
  for (i = 0; i < num_mv_refs; ++i) {
    const MV *this_mv = &pred_mv[i];
    int fp_row, fp_col;
    // INT16_MAX components mark an invalid candidate.
    if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue;
    if (i == 1 && near_same_nearest) continue;
    // Round the 1/8-pel mv to the nearest full-pel position.
    fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
    fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
    max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);

    // Only evaluate the zero vector once.
    if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
    zero_seen |= (fp_row == 0 && fp_col == 0);

    ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
    // Find sad for current vector.
    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                           ref_y_ptr, ref_y_stride);
    // Note if it is the best so far.
    if (this_sad < best_sad) {
      best_sad = this_sad;
      best_index = i;
    }
  }

  // Note the index of the mv that worked best in the reference list.
  x->mv_best_ref_index[ref_frame] = best_index;
  x->max_mv_context[ref_frame] = max_mv;
  x->pred_mv_sad[ref_frame] = best_sad;
}
620 
vp9_setup_pred_block(const MACROBLOCKD * xd,struct buf_2d dst[MAX_MB_PLANE],const YV12_BUFFER_CONFIG * src,int mi_row,int mi_col,const struct scale_factors * scale,const struct scale_factors * scale_uv)621 void vp9_setup_pred_block(const MACROBLOCKD *xd,
622                           struct buf_2d dst[MAX_MB_PLANE],
623                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
624                           const struct scale_factors *scale,
625                           const struct scale_factors *scale_uv) {
626   int i;
627 
628   dst[0].buf = src->y_buffer;
629   dst[0].stride = src->y_stride;
630   dst[1].buf = src->u_buffer;
631   dst[2].buf = src->v_buffer;
632   dst[1].stride = dst[2].stride = src->uv_stride;
633 
634   for (i = 0; i < MAX_MB_PLANE; ++i) {
635     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
636                      i ? scale_uv : scale, xd->plane[i].subsampling_x,
637                      xd->plane[i].subsampling_y);
638   }
639 }
640 
// Pixel offset of a 4x4 block (in raster order within the plane block)
// into a buffer with the given stride.
int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
                            int stride) {
  const int bw = b_width_log2_lookup[plane_bsize];
  const int col = raster_block & ((1 << bw) - 1);
  const int row = raster_block >> bw;
  return 4 * row * stride + 4 * col;
}
648 
// int16_t-buffer variant of vp9_raster_block_offset(): returns a pointer to
// the 4x4 block inside `base`, using the plane block width as the stride.
int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
                                       int16_t *base) {
  const int stride = num_4x4_blocks_wide_lookup[plane_bsize] * 4;
  const int offset = vp9_raster_block_offset(plane_bsize, raster_block, stride);
  return &base[offset];
}
654 
vp9_get_scaled_ref_frame(const VP9_COMP * cpi,int ref_frame)655 YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
656                                              int ref_frame) {
657   const VP9_COMMON *const cm = &cpi->common;
658   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
659   const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
660   assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
661   return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
662              ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
663              : NULL;
664 }
665 
vp9_get_switchable_rate(const VP9_COMP * cpi,const MACROBLOCKD * const xd)666 int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
667   const MODE_INFO *const mi = xd->mi[0];
668   const int ctx = get_pred_context_switchable_interp(xd);
669   return SWITCHABLE_INTERP_RATE_FACTOR *
670          cpi->switchable_interp_costs[ctx][mi->interp_filter];
671 }
672 
// Set the baseline rd threshold multipliers for the 8x8-and-above mode
// list; set_block_thresholds() later combines these with quantizer- and
// block-size-dependent factors to form the actual pruning thresholds used
// by the rd mode search.
void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
  int i;
  RD_OPT *const rd = &cpi->rd;
  SPEED_FEATURES *const sf = &cpi->sf;

  // Set baseline threshold values. BEST mode starts at -500 rather than 0.
  for (i = 0; i < MAX_MODES; ++i)
    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;

  // NEAREST single-ref modes start above zero only when adaptive rd
  // thresholding is enabled.
  if (sf->adaptive_rd_thresh) {
    rd->thresh_mult[THR_NEARESTMV] = 300;
    rd->thresh_mult[THR_NEARESTG] = 300;
    rd->thresh_mult[THR_NEARESTA] = 300;
  } else {
    rd->thresh_mult[THR_NEARESTMV] = 0;
    rd->thresh_mult[THR_NEARESTG] = 0;
    rd->thresh_mult[THR_NEARESTA] = 0;
  }

  rd->thresh_mult[THR_DC] += 1000;

  rd->thresh_mult[THR_NEWMV] += 1000;
  rd->thresh_mult[THR_NEWA] += 1000;
  rd->thresh_mult[THR_NEWG] += 1000;

  rd->thresh_mult[THR_NEARMV] += 1000;
  rd->thresh_mult[THR_NEARA] += 1000;
  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;

  rd->thresh_mult[THR_TM] += 1000;

  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
  rd->thresh_mult[THR_NEARG] += 1000;
  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
  rd->thresh_mult[THR_COMP_NEWGA] += 2000;

  rd->thresh_mult[THR_ZEROMV] += 2000;
  rd->thresh_mult[THR_ZEROG] += 2000;
  rd->thresh_mult[THR_ZEROA] += 2000;
  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;

  rd->thresh_mult[THR_H_PRED] += 2000;
  rd->thresh_mult[THR_V_PRED] += 2000;
  rd->thresh_mult[THR_D45_PRED] += 2500;
  rd->thresh_mult[THR_D135_PRED] += 2500;
  rd->thresh_mult[THR_D117_PRED] += 2500;
  rd->thresh_mult[THR_D153_PRED] += 2500;
  rd->thresh_mult[THR_D207_PRED] += 2500;
  rd->thresh_mult[THR_D63_PRED] += 2500;
}
726 
// Load the baseline sub-8x8 rd threshold multipliers (indexed by reference
// combination): row 0 for BEST mode, row 1 for the other encoding modes.
void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
  static const int thresh_mult[2][MAX_REFS] = {
    { 2500, 2500, 2500, 4500, 4500, 2500 },
    { 2000, 2000, 2000, 4000, 4000, 2000 }
  };
  RD_OPT *const rd = &cpi->rd;
  const int row = (cpi->oxcf.mode == BEST) ? 1 : 0;
  memcpy(rd->thresh_mult_sub8x8, thresh_mult[row], sizeof(thresh_mult[row]));
}
736 
vp9_update_rd_thresh_fact(int (* factor_buf)[MAX_MODES],int rd_thresh,int bsize,int best_mode_index)737 void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
738                                int bsize, int best_mode_index) {
739   if (rd_thresh > 0) {
740     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
741     int mode;
742     for (mode = 0; mode < top_mode; ++mode) {
743       const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
744       const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
745       BLOCK_SIZE bs;
746       for (bs = min_size; bs <= max_size; ++bs) {
747         int *const fact = &factor_buf[bs][mode];
748         if (mode == best_mode_index) {
749           *fact -= (*fact >> 4);
750         } else {
751           *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
752         }
753       }
754     }
755   }
756 }
757 
// Rate penalty applied to intra modes, proportional to the (8-bit) dc quant
// step and reduced for small blocks unless the estimated noise level is
// high.
int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
                               int qindex, int qdelta) {
  // Reduce the intra cost penalty for small blocks (<=16x16).
  int reduction_fac =
      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;

  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
    // Don't reduce intra cost penalty if estimated noise level is high.
    reduction_fac = 0;

  // Always use VPX_BITS_8 as input here because the penalty is applied
  // to rate not distortion so we want a consistent penalty for all bit
  // depths. If the actual bit depth were passed in here then the value
  // returned by vp9_dc_quant() would scale with the bit depth and we would
  // then need to apply inverse scaling to correct back to a bit depth
  // independent rate penalty.
  return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
}
776