// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()
#include <string.h>  // for memset(), memcpy()

#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10      // I16 mode
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1 bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
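// MULT_8B() is a rounded fixed-point multiply: it computes (a * b + 128) / 256,
// i.e. 'b' scaled by the 8-bit fixed-point factor 'a'. For instance,
// MULT_8B(64, 100) = (6400 + 128) >> 8 = 25 (~= 100 * 64 / 256). It is used
// below to apply the 'tlambda' weighting to the texture distortion.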

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

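// kZigzag[n] gives the position, in natural (raster) coefficient order, of
// the n-th coefficient in zigzag scan order.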
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
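    // (QUANTDIV(coeff, iQ, B) computes (coeff * iQ + B) >> QFIX, with
    // BIAS(b) = b << (QFIX - 8); cf. vp8i_enc.h. Solving
    // 'coeff * iq_ + bias_ < (1 << QFIX)' for coeff yields zthresh_ above.)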
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

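    // All these lambdas grow roughly like q^2, consistent with the
    // squared-error distortion terms they are weighed against in the RD score.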
    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the 1-segment-only case).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piecewise-linear function.
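// For instance, quality 75 gives linear_c = 0.5 and, through the cube root
// below, c_base ~= 0.794, which VP8SetSegmentParams() turns into a base
// quantizer of roughly 127 * (1 - 0.794) ~= 26 (before segment modulation).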
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // finally, initialize segments' filtering

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

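// VP8Scan[] and VP8ScanUV[] below give the offset of each 4x4 sub-block
// inside this cache, using a stride of BPS bytes per row.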
const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values, though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
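
// The trellis is a small dynamic program over coefficient positions: at each
// position n we keep NUM_NODES candidate levels (level0 + m, with m in
// [-MIN_DELTA, MAX_DELTA]), each linked to its best-scoring predecessor at
// position n - 1. The best path is unwound once the last position is reached.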

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
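
// In both scores above, rate is weighted by 'lambda' while distortion is
// weighted by the fixed RD_DISTO_MULT (256): a larger lambda penalizes rate
// more, trading bits for distortion.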

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect coefficients all the way up to n = 16.
    // Going up to last + 1 (inclusive) loses very little.
    if (last < 15) ++last;

    // compute the 'skip' score: the cost of coding the block as all-zero.
    // This is the score to beat.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on the terminal node is not necessarily equal to the
    // best_prev for a non-terminal one. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.
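// The returned 'nz' is a bitmap of non-zero sub-blocks: bits 0..15 flag the
// luma AC 4x4 blocks and bit 24 flags the luma DC (WHT) block, while
// ReconstructUV() shifts its result left by 16 so chroma lands in bits 16..23.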

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
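
// Note: C1 + C2 = 15, so only 15/16th of each error is propagated in total
// (C1/16th downward and C2/16th to the right, after the DSHIFT descaling),
// which is the deliberate under-correction mentioned above.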

// Quantize as usual, but also compute and return the quantization error.
// The returned error is pre-scaled by DSCALE; the remaining
// (DSHIFT - DSCALE) descaling is applied when the error is diffused
// (cf. CorrectDCValues() below).
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // The error 'err' is bounded by mtx->q_[0], which is 132 at most. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* const it,
                                 const VP8ModeScore* const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0]  = rd->derr[ch][1];            //     ... err2
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
  }
}

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost, where rate is weighted by lambda
// against distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to estimate the average
  // delta between the sub-4x4 blocks.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

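// Returns 1 ('flat') if the total number of non-zero AC coefficients over
// 'num_blocks' consecutive blocks stays at or below 'thresh', 0 otherwise.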
static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid a flat area being mispredicted by a complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// Returns the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

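// Quantize and reconstruct with the modes already decided. This is notably
// how VP8Decimate() applies the final trellis pass (RD_OPT_TRELLIS) once the
// mode search is done.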
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants with the right order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}