// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0       // I16 mode (special case)
#define FLATNESS_LIMIT_I4  3       // I4 mode
#define FLATNESS_LIMIT_UV  2       // UV mode
#define FLATNESS_PENALTY   140     // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
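// i.e. a fixed-point multiply with rounding: MULT_8B(a, b) ~= (a * b) / 256,
// where 'b' is interpreted as a value in 1/256th units.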

#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif   // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
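// The resulting per-coefficient boost sharpen_[] (computed in ExpandMatrix()
// below) is added to the coefficient's magnitude before quantization; see the
// use of mtx->sharpen_[j] in TrellisQuantizeBlock().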

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
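    // Worked example (assuming QFIX == 17 and BIAS(b) == (b) << (QFIX - 8),
    // as defined in vp8i_enc.h): with q_ = 16 and bias = 96, we get
    // iq_ = 131072 / 16 = 8192, bias_ = 49152, and
    // zthresh_ = (131071 - 49152) / 8192 = 9. Indeed,
    // QUANTDIV(9, 8192, 49152)  = (73728 + 49152) >> 17 = 0, while
    // QUANTDIV(10, 8192, 49152) = (81920 + 49152) >> 17 = 1.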
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;
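
    // Note: most of these multipliers scale roughly as q^2 (squared-error
    // distortion grows with the square of the quantization step), while
    // tlambda_ scales linearly with q_i4; the shifts are empirical tunings.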

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the 1-segment-only case).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piece-wise linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
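
// For instance, quality 75 gives c = 0.75, hence linear_c = 0.5 and a
// compression factor v = 0.5^(1/3) ~= 0.794; with a neutral segment
// (expn == 1), VP8SetSegmentParams() below turns this into
// quant q = (int)(127 * (1 - 0.794)) ~= 26.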

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
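
// E.g. a mid-complexity source with alpha = 0.575 (the midpoint of
// [amin, amax]) gets expn = 0.65, so quality 75 (c = 0.75) maps to
// v = 0.75^0.65 ~= 0.83.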

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetic)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are noticeably more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
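
// Per the layout above, the U and V 4x4 blocks sit side by side in the cache:
// these offsets are relative to the U plane origin (U_OFF_ENC), with the four
// V blocks starting at x = 8.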

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
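
// Relative to the neutral weight 16 used when USE_TDISTO == 0, these weights
// make the trellis penalize errors on low-frequency coefficients more heavily
// than errors on high-frequency ones.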

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}

static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
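
    // 'thresh_level' is the nearest-rounding quantization result (0x80 is a
    // half-unit bias): any level above it can only increase the distortion,
    // so such nodes are treated as dead below.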

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on the terminal node is not necessarily equal to the
    // best_prev for non-terminal nodes. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
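  // Note: the Y2 (DC) non-zero flag lands at bit 24, which is exactly what
  // the (rd->nz & 0x100ffff) == 0x1000000 test in PickBestIntra16() checks.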

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
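// Since the corrections are descaled by DSHIFT (i.e. divided by 16) and
// C1 + C2 == 15, exactly 15/16th of each error gets redistributed.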

// Quantize as usual, but also compute and return the quantization error.
// Error is already descaled by DSCALE; the remaining (DSHIFT - DSCALE)
// descaling is applied when the error is diffused in CorrectDCValues().
static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* const it,
                                 const VP8ModeScore* const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0]  = rd->derr[ch][1];            //     ... err2
    top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
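    // (left[1] + top[1] adds up to err3 exactly, so no error is lost in the
    // 3/4 : 1/4 split.)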
  }
}

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lower RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// Return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}