// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
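
// MULT_8B(a, b) computes (a * b) / 256 with rounding-to-nearest, e.g.
// MULT_8B(64, 300) = (19200 + 128) >> 8 = 75. It is used below to apply the
// 8-bit fixed-point 'tlambda' weight to the texture distortion.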

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
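
// kZigzag[n] gives the raster index (row-major within the 4x4 block) of the
// n-th coefficient in scan order: e.g. kZigzag[2] == 4, i.e. the third
// scanned coefficient is the one at row 1, column 0.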

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
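
// A worked example of the fixed-point setup above, assuming the vp8i_enc.h
// definitions QFIX == 17, BIAS(b) == (b) << (QFIX - 8) and
// QUANTDIV(n, iQ, B) == (n * iQ + B) >> QFIX: with an AC quantizer
// q_[1] = 40 and bias 110, iq_[1] = 131072 / 40 = 3276,
// bias_[1] = 110 << 9 = 56320 and
// zthresh_[1] = (131071 - 56320) / 3276 = 22. Indeed,
// QUANTDIV(22, 3276, 56320) = (72072 + 56320) >> 17 = 0 while
// QUANTDIV(23, 3276, 56320) = (75348 + 56320) >> 17 = 1, as promised.
// Sharpening stays tiny by design: for the highest frequency,
// (kFreqSharpening[15] * 40) >> SHARPEN_BITS = 3600 >> 11 = 1.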

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piece-wise linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
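
// For instance, quality 75 maps to c = 0.75, hence linear_c = 2 * 0.75 - 1 =
// 0.5 and a compressibility of pow(0.5, 1/3.) ~= 0.794; with a neutral
// segment (expn == 1), VP8SetSegmentParams() below then yields
// q = 127 * (1 - 0.794) ~= 26.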

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
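
// Example: a mid-complexity source with alpha = 0.5 gets
// expn = 0.9 - (0.5 / 0.55) * (0.5 - 0.3) ~= 0.718, so quality 75 (c = 0.75)
// maps to pow(0.75, 0.718) ~= 0.813, i.e. a slightly higher compressibility
// (and hence lower quantizer) than the non-JPEG mapping above.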

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetic).
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
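
// E.g. if the four segments end up with (quant_, fstrength_) pairs
// {26,10}, {26,10}, {30,12}, {30,12}, the first pass computes
// map = {0, 0, 1, 1}, every macroblock's segment_ id is remapped through it,
// and num_segments_ drops from 4 to 2.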

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are noticeably more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize the segments' filtering, if any

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
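
// Numerical sketch of the UV deltas above: with sns_strength = 50 and
// uv_alpha_ = 80, dq_uv_ac = (80 - 64) * 10 / 70 = 2, rescaled to
// 2 * 50 / 100 = 1 (already within [MIN_DQ_UV, MAX_DQ_UV]), while
// dq_uv_dc = -4 * 50 / 100 = -2, i.e. chroma DC is quantized slightly less
// aggressively than the base quantizer.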

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
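
// With MIN_DELTA == 0 and MAX_DELTA == 1, NUM_NODES == 2: each of the 16
// coefficient positions keeps two candidate nodes, one for the neutrally
// quantized level and one for that level plus one, and the trellis links
// every node at position n to the best-scoring node at position n - 1.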

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
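
// Both helpers evaluate the Lagrangian cost rate * lambda +
// RD_DISTO_MULT * distortion; note that, compared to the usual
// D + lambda * R form, the roles are swapped here: the distortion weight is
// fixed (RD_DISTO_MULT == 256) and 'lambda' scales the rate. E.g. with
// lambda == 100, spending 8 extra units of rate pays off only if it removes
// more than 8 * 100 / 256 ~= 3 units of distortion.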

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect all 16 coeffs: going up to last + 1
    // (inclusive) is enough, without losing much.
    if (last < 15) ++last;

    // compute the 'skip' score: the cost of coding the block as all-zero.
    // This is the score to beat.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // base_score is common to all predecessors, so it is added only once,
      // after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start.
  // Beware! We must preserve the in[0]/out[0] value for the TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;  // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on the terminal node is not necessarily equal to the
    // best_prev for non-terminal nodes. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.
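//
// Note on the returned 'nz' bitfield used throughout the reconstruction
// functions below: bit n (0..15) flags a non-zero 4x4 luma block, bits
// 16..23 flag the eight chroma blocks (see ReconstructUV()'s 'nz << 16'),
// and bit 24 flags the DC/WHT block of an intra16 macroblock.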

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
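
// Numerically: errors are stored right-shifted by DSCALE (halved), and the
// '>> (DSHIFT - DSCALE)' in CorrectDCValues() below restores the intended
// 1/16th units. For a true DC error e = 32, the stored value is 16; the
// block on the right then receives (C2 * 16) >> 3 = 16 = (8/16) * e and the
// block below (C1 * 16) >> 3 = 14 = (7/16) * e, i.e. 15/16th of e in total.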

// Quantize as usual, but also compute and return the quantization error.
// The returned error is already scaled down by DSCALE.
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* const it,
                                 const VP8ModeScore* const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0]  = rd->derr[ch][1];            //     ... err2
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
  }
}
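
// The split of err3 is lossless: e.g. with err3 = 10, left[1] gets
// (3 * 10) >> 2 = 7 and top[1] gets 10 - 7 = 3, so the two neighbors
// together still account for the full error.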

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate the distortion and
// bit-cost, and pick the mode with the lowest RD-cost (see SetRDScore():
// score = Rate * lambda + RD_DISTO_MULT * Distortion).

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// Return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}
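
// E.g. for it->i4_ == 5 (x == 1, y == 1), both neighbors lie inside the
// current macroblock: 'left' is modes[4] and 'top' is modes[1]. Only the
// blocks on the macroblock's first row or column read from it->preds_[].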

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}