1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h> // for abs()
17
18 #include "src/dsp/quant.h"
19 #include "src/enc/vp8i_enc.h"
20 #include "src/enc/cost_enc.h"
21
22 #define DO_TRELLIS_I4 1
23 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
24 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
25 #define USE_TDISTO 1
26
27 #define MID_ALPHA 64 // neutral value for susceptibility
28 #define MIN_ALPHA 30 // lowest usable value for susceptibility
29 #define MAX_ALPHA 100 // higher meaningful value for susceptibility
30
31 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
32 // power-law modulation. Must be strictly less than 1.
33
34 // number of non-zero coeffs below which we consider the block very flat
35 // (and apply a penalty to complex predictions)
36 #define FLATNESS_LIMIT_I16 0 // I16 mode (special case)
37 #define FLATNESS_LIMIT_I4 3 // I4 mode
38 #define FLATNESS_LIMIT_UV 2 // UV mode
39 #define FLATNESS_PENALTY 140 // roughly ~1bit per block
40
41 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42
43 #define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
44
45 // #define DEBUG_BLOCK
46
47 //------------------------------------------------------------------------------
48
49 #if defined(DEBUG_BLOCK)
50
51 #include <stdio.h>
52 #include <stdlib.h>
53
// Debug-only helper (compiled under DEBUG_BLOCK): dumps to stdout the source
// and reconstructed samples of the current macroblock, their absolute
// per-pixel deltas, the chosen mode(s), and all quantized levels.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);  // intra-16x16 vs intra-4x4 MB
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma: input block, output block, then |in - out| per pixel.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("        ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("        ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // U and V planes sit side by side in the cache: columns 0..7 are U,
  // columns 8..15 are V (relative to U_OFF_ENC).
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Summary: distortion, spectral distortion, rate, header bits, nz mask,
  // final RD score.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For i16 blocks the DC coeff (index 0) is carried separately in
  // y_dc_levels, so printing starts at index 1.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
121
122 #endif // DEBUG_BLOCK
123
124 //------------------------------------------------------------------------------
125
clip(int v,int m,int M)126 static WEBP_INLINE int clip(int v, int m, int M) {
127 return v < m ? m : v > M ? M : v;
128 }
129
// Zigzag scan order: maps scan position -> raster index inside a 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer step sizes, indexed by the clipped quantizer index [0..127].
static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer step sizes (used for Y1 and UV AC coeffs), same indexing.
static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantizer step sizes for the Y2 plane (second-order luma-DC / WHT).
static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Quantization rounding biases per matrix type, fed to the BIAS() macro
// in ExpandMatrix().
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening strength, indexed in zigzag order: zero for DC,
// growing with frequency.
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
204
205 //------------------------------------------------------------------------------
206 // Initialize quantization parameters in VP8Matrix
207
208 // Returns the average quantizer
// Fills in the reciprocal quantizers, rounding biases, zero-thresholds and
// sharpening values of 'm' from its two base steps q_[0] (DC) and q_[1] (AC).
// 'type' selects the bias row (0 = luma-AC/i4, 1 = luma-DC/i16, 2 = chroma).
// Returns the rounded average quantizer over the 16 coefficients.
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int idx;
  int total = 0;
  // Derive the fixed-point reciprocal, bias and zero-threshold for the
  // DC (slot 0) and AC (slot 1) quantizer steps.
  for (idx = 0; idx < 2; ++idx) {
    const int is_ac = (idx > 0);
    const int bias_value = kBiasMatrices[type][is_ac];
    m->iq_[idx] = (1 << QFIX) / m->q_[idx];
    m->bias_[idx] = BIAS(bias_value);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[idx] = ((1 << QFIX) - 1 - m->bias_[idx]) / m->iq_[idx];
  }
  // Coefficients 2..15 all reuse the AC parameters.
  for (idx = 2; idx < 16; ++idx) {
    m->q_[idx] = m->q_[1];
    m->iq_[idx] = m->iq_[1];
    m->bias_[idx] = m->bias_[1];
    m->zthresh_[idx] = m->zthresh_[1];
  }
  for (idx = 0; idx < 16; ++idx) {
    // Sharpening is only applied to AC luma coeffs (type 0).
    m->sharpen_[idx] =
        (type == 0) ? ((kFreqSharpening[idx] * m->q_[idx]) >> SHARPEN_BITS)
                    : 0;
    total += m->q_[idx];
  }
  return (total + 8) >> 4;  // rounded average quantizer
}
237
// Clamps a lambda constant to its minimum legal value of 1.
static void CheckLambdaValue(int* const v) {
  if (*v < 1) {
    *v = 1;
  }
}
239
// For each active segment, builds the three quantization matrices (y1 = luma
// AC, y2 = luma DC/WHT, uv = chroma) from the segment quantizer plus the
// global dq deltas, then derives all lambda constants used by mode decision
// and trellis quantization.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // Spatial-distortion lambda (tlambda_) is only enabled for method >= 4.
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): the upper clip bound here is 117, not 127 — it looks
    // like a deliberate cap on the chroma-DC step, but worth confirming.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    // Expand the DC/AC pair into the full 16-coeff matrix; the returned
    // average quantizer seeds the lambda constants below.
    q_i4 = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv = ExpandMatrix(&m->uv_, 2);

    // Empirically-tuned quadratic laws: lambda ~ q^2, with per-use scaling.
    m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_ = (3 * q_i16 * q_i16);
    m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
    m->tlambda_ = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_ = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
288
289 //------------------------------------------------------------------------------
290 // Initialize filtering parameters
291
292 // Very small filter-strength values have close to no visual effect. So we can
293 // save a little decoding-CPU by turning filtering off for these.
294 #define FSTRENGTH_CUTOFF 2
295
// Derives each segment's in-loop filter strength from its AC quantizer step,
// the global filter_strength config and the segment complexity ('beta'),
// then records the header-level filter parameters.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    // NOTE(review): filter_hdr_.sharpness_ is read here but re-assigned from
    // the config below — presumably the two already agree at this point;
    // confirm against the encoder initialization path.
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // Clamp to [0..63]; values below FSTRENGTH_CUTOFF are dropped to 0 to
    // save decoder CPU (see comment above).
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
315
316 //------------------------------------------------------------------------------
317
318 // Note: if you change the values below, remember that the max range
319 // allowed by the syntax for DQ_UV is [-16,16].
320 #define MAX_DQ_UV (6)
321 #define MIN_DQ_UV (-4)
322
323 // We want to emulate jpeg-like behaviour where the expected "good" quality
324 // is around q=75. Internally, our "good" middle is around c=50. So we
325 // map accordingly using linear piece-wise function
QualityToCompression(double c)326 static double QualityToCompression(double c) {
327 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
328 // The file size roughly scales as pow(quantizer, 3.). Actually, the
329 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330 // in the mid-quant range. So we scale the compressibility inversely to
331 // this power-law: quant ~= compression ^ 1/3. This law holds well for
332 // low quant. Finer modeling for high-quant would make use of kAcTable[]
333 // more explicitly.
334 const double v = pow(linear_c, 1 / 3.);
335 return v;
336 }
337
QualityToJPEGCompression(double c,double alpha)338 static double QualityToJPEGCompression(double c, double alpha) {
339 // We map the complexity 'alpha' and quality setting 'c' to a compression
340 // exponent empirically matched to the compression curve of libjpeg6b.
341 // On average, the WebP output size will be roughly similar to that of a
342 // JPEG file compressed with same quality factor.
343 const double amin = 0.30;
344 const double amax = 0.85;
345 const double exp_min = 0.4;
346 const double exp_max = 0.9;
347 const double slope = (exp_min - exp_max) / (amax - amin);
348 // Linearly interpolate 'expn' from exp_min to exp_max
349 // in the [amin, amax] range.
350 const double expn = (alpha > amax) ? exp_min
351 : (alpha < amin) ? exp_max
352 : exp_max + slope * (alpha - amin);
353 const double v = pow(c, expn);
354 return v;
355 }
356
SegmentsAreEquivalent(const VP8SegmentInfo * const S1,const VP8SegmentInfo * const S2)357 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358 const VP8SegmentInfo* const S2) {
359 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360 }
361
// Merges segments that ended up with identical (quant, filter-strength)
// parameters, compacts the dqm_[] array and remaps all per-macroblock
// segment ids accordingly.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {   // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // If found, s2 is the matching compacted slot; otherwise the loop exited
    // with s2 == num_final_segments, which is where this distinct segment is
    // about to be stored.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
401
// Translates the user-level 'quality' setting into per-segment quantizer
// indices (modulated by each segment's susceptibility 'alpha_'), derives the
// UV quantizer deltas, sets up filtering, merges equivalent segments and
// finalizes the quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  // Base compression factor: either jpeg-size-emulating or the default
  // cube-root law (see QualityToCompression()).
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;   // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);   // finalize quantization matrices
}
456
457 //------------------------------------------------------------------------------
458 // Form the predictions in cache
459
// Offsets into the prediction cache (yuv_p_) where each mode's prediction
// samples are stored (see their use in ReconstructIntra16/ReconstructIntra4).
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
468
VP8MakeLuma16Preds(const VP8EncIterator * const it)469 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470 const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471 const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472 VP8EncPredLuma16(it->yuv_p_, left, top);
473 }
474
VP8MakeChroma8Preds(const VP8EncIterator * const it)475 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476 const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478 VP8EncPredChroma8(it->yuv_p_, left, top);
479 }
480
// Computes the intra-4x4 predictions for the current sub-block into the
// prediction cache, from the neighboring samples held in i4_top_.
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
484
485 //------------------------------------------------------------------------------
486 // Quantize
487
488 // Layout:
489 // +----+----+
490 // |YYYY|UUVV| 0
491 // |YYYY|UUVV| 4
492 // |YYYY|....| 8
493 // |YYYY|....| 12
494 // +----+----+
495
// Offsets (x + y * BPS) of the 16 4x4 luma sub-blocks inside the macroblock
// cache, in raster order (see the layout diagram above).
const uint16_t VP8Scan[16] = {  // Luma
  0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
  0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
  0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
  0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Offsets of the four U sub-blocks followed by the four V sub-blocks;
// U and V are stored side by side in the cache (V starts at column 8).
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS,     // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};
507
508 //------------------------------------------------------------------------------
509 // Distortion measurement
510
// Per-coefficient frequency weights (raster order): low frequencies weigh
// more. Presumably used by the texture-distortion metric — the consumer is
// outside this chunk; confirm before relying on it.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Frequency weights applied to the per-coefficient error inside
// TrellisQuantizeBlock(); flat (all 16) when USE_TDISTO is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12, 8,
  11, 10, 8, 6
#endif
};
525
526 // Init/Copy the common fields in score.
// Resets the common rate/distortion fields and seeds the score with the
// worst possible value (callers then minimize it).
static void InitScore(VP8ModeScore* const rd) {
  rd->score = MAX_COST;
  rd->nz = 0;
  rd->H = 0;
  rd->R = 0;
  rd->SD = 0;
  rd->D = 0;
}
535
// Duplicates the common rate/distortion fields from 'src' into 'dst'.
static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->score = src->score;
  dst->nz = src->nz;   // note that nz is not accumulated, but just copied.
  dst->H = src->H;
  dst->R = src->R;
  dst->SD = src->SD;
  dst->D = src->D;
}
544
// Accumulates the rate/distortion fields of 'src' into 'dst'.
static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->score += src->score;
  dst->nz |= src->nz;   // here, new nz bits are accumulated.
  dst->H += src->H;
  dst->R += src->R;
  dst->SD += src->SD;
  dst->D += src->D;
}
553
554 //------------------------------------------------------------------------------
555 // Performs trellis-optimized quantization.
556
// Trellis node: one candidate quantization level at one coefficient position.
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state: the running RD score of a partial path, plus the cost table
// to use for the next coefficient.
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Accessors recentering the [-MIN_DELTA, MAX_DELTA] node index onto 0.
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
578
SetRDScore(int lambda,VP8ModeScore * const rd)579 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
580 rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
581 }
582
RDScoreTrellis(int lambda,score_t rate,score_t distortion)583 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
584 score_t distortion) {
585 return rate * lambda + RD_DISTO_MULT * distortion;
586 }
587
// Coefficient type, selecting the probability/cost tables. For TYPE_I16_AC
// the DC coefficient is coded separately (WHT plane), so coding starts at
// coefficient index 1 (see 'first' in TrellisQuantizeBlock()).
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };
590
// Trellis-optimized quantization of one 4x4 coefficient block.
// 'in' holds the transform coefficients and is rewritten with the
// dequantized values; 'out' receives the quantized levels in zigzag order.
// 'ctx0' is the initial coding context, 'coeff_type' selects the
// probability/cost tables, 'lambda' the rate/distortion trade-off.
// Returns whether any non-zero level remains.
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  // For TYPE_I16_AC, DC is coded separately via the WHT plane: start at 1.
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);   // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    // level0: neutral-bias quantization; thresh_level: upper bound on the
    // levels worth examining (quantized with maximal bias).
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // The base_score is added to all scores so it is only added for the final
      // value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;            // best eob position
          best_path[1] = m;            // best node index
          best_path[2] = best_prev;    // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // dequantized value back into 'in'
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
762
763 #undef NODE
764
765 //------------------------------------------------------------------------------
766 // Performs: difference, transform, quantize, back-transform, add
767 // all at once. Output is the reconstructed block in *yuv_out, and the
768 // quantized levels in *levels.
769
// Difference, forward transform, quantization (trellis when enabled),
// inverse transform and reconstruction for an intra-16x16 prediction 'mode'.
// The reconstructed samples go to 'yuv_out', the quantized levels into 'rd'.
// Returns the non-zero-coefficient bitmask: bits 0..15 for the 16 luma AC
// blocks, bit 24 for the WHT (luma-DC) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform the 16 4x4 residual blocks, two at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // Gather the 16 DC coefficients into the second-order (WHT) plane and
  // quantize them with the y2 matrix.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        // Coding context = number of non-zero top + left neighbors.
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by the WHT plane
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
821
// Same pipeline as ReconstructIntra16 but for a single 4x4 intra block:
// residual vs the 'mode' prediction, quantization (trellis when enabled),
// then reconstruction into 'yuv_out'. Returns whether any level is non-zero.
static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // (x, y) position of the current 4x4 sub-block within the macroblock.
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
845
846 //------------------------------------------------------------------------------
847 // DC-error diffusion
848
849 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
850 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
851 #define C1 7 // fraction of error sent to the 4x4 block below
852 #define C2 8 // fraction of error sent to the 4x4 block on the right
853 #define DSHIFT 4
854 #define DSCALE 1 // storage descaling, needed to make the error fit int8_t
855
856 // Quantize as usual, but also compute and return the quantization error.
857 // Error is already divided by DSHIFT.
QuantizeSingle(int16_t * const v,const VP8Matrix * const mtx)858 static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
859 int V = *v;
860 const int sign = (V < 0);
861 if (sign) V = -V;
862 if (V > (int)mtx->zthresh_[0]) {
863 const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
864 const int err = (V - qV);
865 *v = sign ? -qV : qV;
866 return (sign ? -err : err) >> DSCALE;
867 }
868 *v = 0;
869 return (sign ? -V : V) >> DSCALE;
870 }
871
// Quantizes the four DC coefficients of each chroma channel (U then V) while
// diffusing the quantization error toward the neighboring 4x4 blocks, using
// the C1 (below) / C2 (right) weights. The trailing errors are saved in
// rd->derr[] so they can be propagated to the next macroblock.
static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {   // ch=0: U plane, ch=1: V plane
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];   // the four blocks of this plane
    int err0, err1, err2, err3;
    // Inject the stored neighbor errors before quantizing, then cascade each
    // freshly-computed error into the blocks to the right and below.
    // NOTE: the exact order of these statements matters (err0 feeds blocks
    // 1 and 2; err1/err2 feed block 3).
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
904
// Saves the diffusion errors of the winning UV mode into the iterator's
// top/left error buffers, so the next macroblock can pick them up.
static void StoreDiffusionErrors(VP8EncIterator* const it,
                                 const VP8ModeScore* const rd) {
  int channel;
  for (channel = 0; channel < 2; ++channel) {   // U then V
    int8_t* const top = it->top_derr_[it->x_][channel];
    int8_t* const left = it->left_derr_[channel];
    const int err3_major = 3 * rd->derr[channel][2] >> 2;  // 3/4th of err3
    left[0] = rd->derr[channel][0];                 // err1 goes to the left
    left[1] = err3_major;
    top[0] = rd->derr[channel][1];                  // err2 goes to the top
    top[1] = rd->derr[channel][2] - err3_major;     // remaining 1/4th of err3
  }
}
917
918 #undef C1
919 #undef C2
920 #undef DSHIFT
921 #undef DSCALE
922
923 //------------------------------------------------------------------------------
924
// Reconstructs the chroma (U+V) planes for prediction mode 'mode':
// forward transform, quantization (with optional DC error diffusion and
// optional trellis), then inverse transform into 'yuv_out'.
// Returns the 8 non-zero block flags, shifted into bits [16..23].
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];   // blocks 0..3 = U plane, 4..7 = V plane

  for (n = 0; n < 8; n += 2) {   // two blocks per call
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // Diffuse the DC quantization error when error-diffusion is enabled.
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {   // disabled by default (see above)
    int ch, x, y;
    // ch selects the plane (0: U, 2: V); top_nz_/left_nz_ slots 4..7 hold
    // the chroma contexts.
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // chroma nz bits live above the 16 luma bits
}
965
966 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
969
StoreMaxDelta(VP8SegmentInfo * const dqm,const int16_t DCs[16])970 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
971 // We look at the first three AC coefficients to determine what is the average
972 // delta between each sub-4x4 block.
973 const int v0 = abs(DCs[1]);
974 const int v1 = abs(DCs[2]);
975 const int v2 = abs(DCs[4]);
976 int max_v = (v1 > v0) ? v1 : v0;
977 max_v = (v2 > max_v) ? v2 : max_v;
978 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
979 }
980
// Exchanges two VP8ModeScore pointers.
static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const saved = *b;
  *b = *a;
  *a = saved;
}
986
// Exchanges two byte-buffer pointers.
static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const saved = *b;
  *b = *a;
  *a = saved;
}
992
// Flips the iterator's primary and scratch reconstruction buffers.
static void SwapOut(VP8EncIterator* const it) {
  uint8_t* const previous = it->yuv_out_;
  it->yuv_out_ = it->yuv_out2_;
  it->yuv_out2_ = previous;
}
996
// Evaluates every Intra16 prediction mode, reconstructing each into the
// scratch buffer and keeping the one with the lowest RD score. On return,
// *rd holds the best mode's full score/levels and it->yuv_out_ holds its
// reconstruction (via the SwapOut double-buffering below).
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score: D = SSE distortion, SD = spectral distortion
    // (only when tlambda is non-zero), H = fixed mode cost, R = coeff cost.
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // New best: swap score holders and output buffers instead of copying.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {   // the winner ended up in rd_tmp; copy it back
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
1053
1054 //------------------------------------------------------------------------------
1055
1056 // return the cost array corresponding to the surrounding prediction modes.
GetCostModeI4(VP8EncIterator * const it,const uint8_t modes[16])1057 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
1058 const uint8_t modes[16]) {
1059 const int preds_w = it->enc_->preds_w_;
1060 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1061 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1062 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1063 return VP8FixedCostsI4[top][left];
1064 }
1065
// Tries the Intra4 coding of the macroblock, sub-block by sub-block, against
// the Intra16 score already stored in *rd. Returns 1 (and commits the i4
// decision) when Intra4 wins, 0 when it bails out early (score or header-bit
// budget exceeded) and Intra16 must be kept.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {   // i4 disabled by configuration
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty, to avoid flat area to be mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check: skip the (expensive) coefficient-cost evaluation
      // when this mode already cannot beat the current best.
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Keep the winning reconstruction by swapping, not copying.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    // Bail out as soon as the accumulated i4 score exceeds the i16 one.
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    // Propagate the non-zero context to the neighboring sub-blocks.
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // reconstruction was accumulated in yuv_out2_
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
1159
1160 //------------------------------------------------------------------------------
1161
// Evaluates every chroma prediction mode, keeps the one with the lowest RD
// score, and adds its score to *rd. Also saves the winning mode's diffusion
// errors (when error diffusion is enabled) for the next macroblock.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    // Penalize complex modes on flat blocks to avoid misprediction.
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);   // keep the winning reconstruction in 'dst'
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
1210
1211 //------------------------------------------------------------------------------
1212 // Final reconstruction and quantization.
1213
// Reconstructs the macroblock with the modes already stored in the iterator
// (no mode search), and records the resulting non-zero pattern in rd->nz.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  int nz = 0;

  if (it->mb_->type_ != 1) {   // intra4x4: quantize each sub-block in turn
    VP8IteratorStartI4(it);
    do {
      const int i4 = it->i4_;
      const int mode = it->preds_[(i4 & 3) + (i4 >> 2) * enc->preds_w_];
      const uint8_t* const blk_src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[i4];
      uint8_t* const blk_dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[i4];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[i4],
                              blk_src, blk_dst, mode) << i4;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  } else {                     // intra16x16: one shot
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
1237
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// Fast heuristic mode decision: each candidate is scored as
// SSE * RD_DISTO_MULT + fixed_mode_cost * lambda_d_*, avoiding the full
// rate computation. When 'try_both_modes' is set, i16 and i4 compete;
// otherwise the pre-set macroblock type is kept. 'refine_uv_mode'
// re-evaluates the chroma mode the same way.
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;   // constant rate penalty for i4 mode
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      // Skip expensive modes over the header-bit budget (mode 0 always kept).
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // i4 reconstruction was accumulated in yuv_out2_
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
1351
1352 //------------------------------------------------------------------------------
1353 // Entry point
1354
// Entry point: decides the intra modes for the current macroblock (with or
// without RD optimization), reconstructs it, flags it as skipped when all
// coefficients quantize to zero, and returns that skip flag.
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  const int method = it->enc_->method_;
  int is_skipped;

  InitScore(rd);

  // Luma16x16 and Chroma8x8 predictions can be computed upfront;
  // Luma4x4 predictions are generated on the fly while iterating.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt <= RD_OPT_NONE) {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we don't re-examine the decision but just go ahead with
    // quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  } else {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
1389