1 // Copyright 2011 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // Quantization
11 //
12 // Author: Skal (pascal.massimino@gmail.com)
13
14 #include <assert.h>
15 #include <math.h>
16 #include <stdlib.h> // for abs()
17
18 #include "src/enc/vp8i_enc.h"
19 #include "src/enc/cost_enc.h"
20
21 #define DO_TRELLIS_I4 1
22 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
23 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
24 #define USE_TDISTO 1
25
26 #define MID_ALPHA 64 // neutral value for susceptibility
27 #define MIN_ALPHA 30 // lowest usable value for susceptibility
28 #define MAX_ALPHA 100 // highest meaningful value for susceptibility
29
30 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
31 // power-law modulation. Must be strictly less than 1.
32
33 // number of non-zero coeffs below which we consider the block very flat
34 // (and apply a penalty to complex predictions)
35 #define FLATNESS_LIMIT_I16 10 // I16 mode
36 #define FLATNESS_LIMIT_I4 3 // I4 mode
37 #define FLATNESS_LIMIT_UV 2 // UV mode
38 #define FLATNESS_PENALTY 140 // roughly ~1bit per block
39
40 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
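// For reference: MULT_8B(a, b) is a * b / 256 with rounding, e.g.
// MULT_8B(64, 200) == (12800 + 128) >> 8 == 50.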
41
42 #define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
43
44 // #define DEBUG_BLOCK
45
46 //------------------------------------------------------------------------------
47
48 #if defined(DEBUG_BLOCK)
49
50 #include <stdio.h>
51 #include <stdlib.h>
52
53 static void PrintBlockInfo(const VP8EncIterator* const it,
54 const VP8ModeScore* const rd) {
55 int i, j;
56 const int is_i16 = (it->mb_->type_ == 1);
57 const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
58 const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
59 const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
60 const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
61 printf("SOURCE / OUTPUT / ABS DELTA\n");
62 for (j = 0; j < 16; ++j) {
63 for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
64 printf(" ");
65 for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
66 printf(" ");
67 for (i = 0; i < 16; ++i) {
68 printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
69 }
70 printf("\n");
71 }
72 printf("\n"); // newline before the U/V block
73 for (j = 0; j < 8; ++j) {
74 for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
75 printf(" ");
76 for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
77 printf(" ");
78 for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
79 printf(" ");
80 for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
81 printf(" ");
82 for (i = 0; i < 8; ++i) {
83 printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
84 }
85 printf(" ");
86 for (i = 8; i < 16; ++i) {
87 printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
88 }
89 printf("\n");
90 }
91 printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
92 (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
93 (int)rd->score);
94 if (is_i16) {
95 printf("Mode: %d\n", rd->mode_i16);
96 printf("y_dc_levels:");
97 for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
98 printf("\n");
99 } else {
100 printf("Modes[16]: ");
101 for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
102 printf("\n");
103 }
104 printf("y_ac_levels:\n");
105 for (j = 0; j < 16; ++j) {
106 for (i = is_i16 ? 1 : 0; i < 16; ++i) {
107 printf("%4d ", rd->y_ac_levels[j][i]);
108 }
109 printf("\n");
110 }
111 printf("\n");
112 printf("uv_levels (mode=%d):\n", rd->mode_uv);
113 for (j = 0; j < 8; ++j) {
114 for (i = 0; i < 16; ++i) {
115 printf("%4d ", rd->uv_levels[j][i]);
116 }
117 printf("\n");
118 }
119 }
120
121 #endif // DEBUG_BLOCK
122
123 //------------------------------------------------------------------------------
124
125 static WEBP_INLINE int clip(int v, int m, int M) {
126 return v < m ? m : v > M ? M : v;
127 }
128
129 static const uint8_t kZigzag[16] = {
130 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
131 };
132
133 static const uint8_t kDcTable[128] = {
134 4, 5, 6, 7, 8, 9, 10, 10,
135 11, 12, 13, 14, 15, 16, 17, 17,
136 18, 19, 20, 20, 21, 21, 22, 22,
137 23, 23, 24, 25, 25, 26, 27, 28,
138 29, 30, 31, 32, 33, 34, 35, 36,
139 37, 37, 38, 39, 40, 41, 42, 43,
140 44, 45, 46, 46, 47, 48, 49, 50,
141 51, 52, 53, 54, 55, 56, 57, 58,
142 59, 60, 61, 62, 63, 64, 65, 66,
143 67, 68, 69, 70, 71, 72, 73, 74,
144 75, 76, 76, 77, 78, 79, 80, 81,
145 82, 83, 84, 85, 86, 87, 88, 89,
146 91, 93, 95, 96, 98, 100, 101, 102,
147 104, 106, 108, 110, 112, 114, 116, 118,
148 122, 124, 126, 128, 130, 132, 134, 136,
149 138, 140, 143, 145, 148, 151, 154, 157
150 };
151
152 static const uint16_t kAcTable[128] = {
153 4, 5, 6, 7, 8, 9, 10, 11,
154 12, 13, 14, 15, 16, 17, 18, 19,
155 20, 21, 22, 23, 24, 25, 26, 27,
156 28, 29, 30, 31, 32, 33, 34, 35,
157 36, 37, 38, 39, 40, 41, 42, 43,
158 44, 45, 46, 47, 48, 49, 50, 51,
159 52, 53, 54, 55, 56, 57, 58, 60,
160 62, 64, 66, 68, 70, 72, 74, 76,
161 78, 80, 82, 84, 86, 88, 90, 92,
162 94, 96, 98, 100, 102, 104, 106, 108,
163 110, 112, 114, 116, 119, 122, 125, 128,
164 131, 134, 137, 140, 143, 146, 149, 152,
165 155, 158, 161, 164, 167, 170, 173, 177,
166 181, 185, 189, 193, 197, 201, 205, 209,
167 213, 217, 221, 225, 229, 234, 239, 245,
168 249, 254, 259, 264, 269, 274, 279, 284
169 };
170
171 static const uint16_t kAcTable2[128] = {
172 8, 8, 9, 10, 12, 13, 15, 17,
173 18, 20, 21, 23, 24, 26, 27, 29,
174 31, 32, 34, 35, 37, 38, 40, 41,
175 43, 44, 46, 48, 49, 51, 52, 54,
176 55, 57, 58, 60, 62, 63, 65, 66,
177 68, 69, 71, 72, 74, 75, 77, 79,
178 80, 82, 83, 85, 86, 88, 89, 93,
179 96, 99, 102, 105, 108, 111, 114, 117,
180 120, 124, 127, 130, 133, 136, 139, 142,
181 145, 148, 151, 155, 158, 161, 164, 167,
182 170, 173, 176, 179, 184, 189, 193, 198,
183 203, 207, 212, 217, 221, 226, 230, 235,
184 240, 244, 249, 254, 258, 263, 268, 274,
185 280, 286, 292, 299, 305, 311, 317, 323,
186 330, 336, 342, 348, 354, 362, 370, 379,
187 385, 393, 401, 409, 416, 424, 432, 440
188 };
189
190 static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac]
191 { 96, 110 }, { 96, 108 }, { 110, 115 }
192 };
193
194 // Sharpening by (slightly) raising the hi-frequency coeffs.
195 // Hack-ish but helpful for mid-bitrate range. Use with care.
196 #define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
197 static const uint8_t kFreqSharpening[16] = {
198 0, 30, 60, 90,
199 30, 60, 90, 90,
200 60, 90, 90, 90,
201 90, 90, 90, 90
202 };
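// For example, at q == 60 the highest-frequency coefficients receive a
// sharpen_[] value of (90 * 60) >> SHARPEN_BITS == 2 (see ExpandMatrix() below).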
203
204 //------------------------------------------------------------------------------
205 // Initialize quantization parameters in VP8Matrix
206
207 // Returns the average quantizer
208 static int ExpandMatrix(VP8Matrix* const m, int type) {
209 int i, sum;
210 for (i = 0; i < 2; ++i) {
211 const int is_ac_coeff = (i > 0);
212 const int bias = kBiasMatrices[type][is_ac_coeff];
213 m->iq_[i] = (1 << QFIX) / m->q_[i];
214 m->bias_[i] = BIAS(bias);
215 // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
216 // * zero if coeff <= zthresh
217 // * non-zero if coeff > zthresh
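// For instance, assuming QFIX == 17 and BIAS(b) == (b) << (QFIX - 8):
// q_[i] == 4 with bias == 96 gives iq_ == 32768, bias_ == 49152 and
// zthresh_ == 2, so coeffs 0..2 quantize to zero while 3 maps to level 1.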
218 m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
219 }
220 for (i = 2; i < 16; ++i) {
221 m->q_[i] = m->q_[1];
222 m->iq_[i] = m->iq_[1];
223 m->bias_[i] = m->bias_[1];
224 m->zthresh_[i] = m->zthresh_[1];
225 }
226 for (sum = 0, i = 0; i < 16; ++i) {
227 if (type == 0) { // we only use sharpening for AC luma coeffs
228 m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
229 } else {
230 m->sharpen_[i] = 0;
231 }
232 sum += m->q_[i];
233 }
234 return (sum + 8) >> 4;
235 }
236
237 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
238
239 static void SetupMatrices(VP8Encoder* enc) {
240 int i;
241 const int tlambda_scale =
242 (enc->method_ >= 4) ? enc->config_->sns_strength
243 : 0;
244 const int num_segments = enc->segment_hdr_.num_segments_;
245 for (i = 0; i < num_segments; ++i) {
246 VP8SegmentInfo* const m = &enc->dqm_[i];
247 const int q = m->quant_;
248 int q_i4, q_i16, q_uv;
249 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
250 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];
251
252 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
253 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
254
255 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
256 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
257
258 q_i4 = ExpandMatrix(&m->y1_, 0);
259 q_i16 = ExpandMatrix(&m->y2_, 1);
260 q_uv = ExpandMatrix(&m->uv_, 2);
261
262 m->lambda_i4_ = (3 * q_i4 * q_i4) >> 7;
263 m->lambda_i16_ = (3 * q_i16 * q_i16);
264 m->lambda_uv_ = (3 * q_uv * q_uv) >> 6;
265 m->lambda_mode_ = (1 * q_i4 * q_i4) >> 7;
266 m->lambda_trellis_i4_ = (7 * q_i4 * q_i4) >> 3;
267 m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
268 m->lambda_trellis_uv_ = (q_uv * q_uv) << 1;
269 m->tlambda_ = (tlambda_scale * q_i4) >> 5;
270
271 // none of these constants should be < 1
272 CheckLambdaValue(&m->lambda_i4_);
273 CheckLambdaValue(&m->lambda_i16_);
274 CheckLambdaValue(&m->lambda_uv_);
275 CheckLambdaValue(&m->lambda_mode_);
276 CheckLambdaValue(&m->lambda_trellis_i4_);
277 CheckLambdaValue(&m->lambda_trellis_i16_);
278 CheckLambdaValue(&m->lambda_trellis_uv_);
279 CheckLambdaValue(&m->tlambda_);
280
281 m->min_disto_ = 20 * m->y1_.q_[0]; // quantization-aware min disto
282 m->max_edge_ = 0;
283
284 m->i4_penalty_ = 1000 * q_i4 * q_i4;
285 }
286 }
287
288 //------------------------------------------------------------------------------
289 // Initialize filtering parameters
290
291 // Very small filter-strength values have close to no visual effect. So we can
292 // save a little decoding-CPU by turning filtering off for these.
293 #define FSTRENGTH_CUTOFF 2
294
295 static void SetupFilterStrength(VP8Encoder* const enc) {
296 int i;
297 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
298 const int level0 = 5 * enc->config_->filter_strength;
299 for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
300 VP8SegmentInfo* const m = &enc->dqm_[i];
301 // We focus on the quantization of AC coeffs.
302 const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
303 const int base_strength =
304 VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
305 // Segments with lower complexity ('beta') will be less filtered.
306 const int f = base_strength * level0 / (256 + m->beta_);
307 m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
308 }
309 // We record the initial strength (mainly for the case of 1-segment only).
310 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
311 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
312 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
313 }
314
315 //------------------------------------------------------------------------------
316
317 // Note: if you change the values below, remember that the max range
318 // allowed by the syntax for DQ_UV is [-16,16].
319 #define MAX_DQ_UV (6)
320 #define MIN_DQ_UV (-4)
321
322 // We want to emulate jpeg-like behaviour where the expected "good" quality
323 // is around q=75. Internally, our "good" middle is around c=50. So we
324 // map accordingly using a piece-wise linear function.
325 static double QualityToCompression(double c) {
326 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
327 // The file size roughly scales as pow(quantizer, 3.). Actually, the
328 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
329 // in the mid-quant range. So we scale the compressibility inversely to
330 // this power-law: quant ~= compression ^ 1/3. This law holds well for
331 // low quant. Finer modeling for high-quant would make use of kAcTable[]
332 // more explicitly.
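// As a quick sanity check: quality 75 (c == 0.75) gives linear_c == 0.5 and
// v == pow(0.5, 1. / 3.) ~= 0.794, while quality 50 gives v ~= 0.693.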
333 const double v = pow(linear_c, 1 / 3.);
334 return v;
335 }
336
337 static double QualityToJPEGCompression(double c, double alpha) {
338 // We map the complexity 'alpha' and quality setting 'c' to a compression
339 // exponent empirically matched to the compression curve of libjpeg6b.
340 // On average, the WebP output size will be roughly similar to that of a
341 // JPEG file compressed with same quality factor.
342 const double amin = 0.30;
343 const double amax = 0.85;
344 const double exp_min = 0.4;
345 const double exp_max = 0.9;
346 const double slope = (exp_min - exp_max) / (amax - amin);
347 // Linearly interpolate 'expn' from exp_min to exp_max
348 // in the [amin, amax] range.
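// (e.g. alpha == 0.575, the middle of [amin, amax], yields expn == 0.65.)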
349 const double expn = (alpha > amax) ? exp_min
350 : (alpha < amin) ? exp_max
351 : exp_max + slope * (alpha - amin);
352 const double v = pow(c, expn);
353 return v;
354 }
355
356 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
357 const VP8SegmentInfo* const S2) {
358 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
359 }
360
361 static void SimplifySegments(VP8Encoder* const enc) {
362 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
363 // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
364 // explicit check is needed to avoid a spurious warning about 'i' exceeding
365 // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
366 const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
367 ? enc->segment_hdr_.num_segments_
368 : NUM_MB_SEGMENTS;
369 int num_final_segments = 1;
370 int s1, s2;
371 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments
372 const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
373 int found = 0;
374 // check if we already have similar segment
375 for (s2 = 0; s2 < num_final_segments; ++s2) {
376 const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
377 if (SegmentsAreEquivalent(S1, S2)) {
378 found = 1;
379 break;
380 }
381 }
382 map[s1] = s2;
383 if (!found) {
384 if (num_final_segments != s1) {
385 enc->dqm_[num_final_segments] = enc->dqm_[s1];
386 }
387 ++num_final_segments;
388 }
389 }
390 if (num_final_segments < num_segments) { // Remap
391 int i = enc->mb_w_ * enc->mb_h_;
392 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
393 enc->segment_hdr_.num_segments_ = num_final_segments;
394 // Replicate the trailing segment infos (it's mostly cosmetics)
395 for (i = num_final_segments; i < num_segments; ++i) {
396 enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
397 }
398 }
399 }
400
401 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
402 int i;
403 int dq_uv_ac, dq_uv_dc;
404 const int num_segments = enc->segment_hdr_.num_segments_;
405 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
406 const double Q = quality / 100.;
407 const double c_base = enc->config_->emulate_jpeg_size ?
408 QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
409 QualityToCompression(Q);
410 for (i = 0; i < num_segments; ++i) {
411 // We modulate the base coefficient to account for the quantization
412 // susceptibility and allow denser segments to be quantized more.
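// For instance, with a typical sns_strength of 50, 'amp' is only
// 0.9 * 50 / 100. / 128. ~= 0.0035, so 'expn' stays close to 1 and the
// per-segment deviation from c_base remains mild.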
413 const double expn = 1. - amp * enc->dqm_[i].alpha_;
414 const double c = pow(c_base, expn);
415 const int q = (int)(127. * (1. - c));
416 assert(expn > 0.);
417 enc->dqm_[i].quant_ = clip(q, 0, 127);
418 }
419
420 // purely indicative in the bitstream (except for the 1-segment case)
421 enc->base_quant_ = enc->dqm_[0].quant_;
422
423 // fill-in values for the unused segments (required by the syntax)
424 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
425 enc->dqm_[i].quant_ = enc->base_quant_;
426 }
427
428 // uv_alpha_ is normally spread around ~60. The useful range is
429 // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
430 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
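// (e.g. uv_alpha_ == 100 maps to (100 - 64) * 10 / 70 == 5 here, before the
// sns-based rescaling and clipping below.)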
431 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
432 / (MAX_ALPHA - MIN_ALPHA);
433 // we rescale by the user-defined strength of adaptation
434 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
435 // and make it safe.
436 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
437 // We also boost the dc-uv-quant a little, based on sns-strength, since
438 // U/V channels are much more reactive to high quants (flat DC-blocks
439 // tend to appear, and are unpleasant).
440 dq_uv_dc = -4 * enc->config_->sns_strength / 100;
441 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed
442
443 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum
444 enc->dq_y2_dc_ = 0;
445 enc->dq_y2_ac_ = 0;
446 enc->dq_uv_dc_ = dq_uv_dc;
447 enc->dq_uv_ac_ = dq_uv_ac;
448
449 SetupFilterStrength(enc); // initialize the segments' filtering, if applicable
450
451 if (num_segments > 1) SimplifySegments(enc);
452
453 SetupMatrices(enc); // finalize quantization matrices
454 }
455
456 //------------------------------------------------------------------------------
457 // Form the predictions in cache
458
459 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
460 const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
461 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
462
463 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
464 const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
465 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
466 };
467
468 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
469 const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
470 const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
471 VP8EncPredLuma16(it->yuv_p_, left, top);
472 }
473
474 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
475 const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
476 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
477 VP8EncPredChroma8(it->yuv_p_, left, top);
478 }
479
480 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
481 VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
482 }
483
484 //------------------------------------------------------------------------------
485 // Quantize
486
487 // Layout:
488 // +----+----+
489 // |YYYY|UUVV| 0
490 // |YYYY|UUVV| 4
491 // |YYYY|....| 8
492 // |YYYY|....| 12
493 // +----+----+
494
495 const uint16_t VP8Scan[16] = { // Luma
496 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
497 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
498 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
499 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
500 };
501
502 static const uint16_t VP8ScanUV[4 + 4] = {
503 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
504 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
505 };
506
507 //------------------------------------------------------------------------------
508 // Distortion measurement
509
510 static const uint16_t kWeightY[16] = {
511 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
512 };
513
514 static const uint16_t kWeightTrellis[16] = {
515 #if USE_TDISTO == 0
516 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
517 #else
518 30, 27, 19, 11,
519 27, 24, 17, 10,
520 19, 17, 12, 8,
521 11, 10, 8, 6
522 #endif
523 };
524
525 // Init/Copy the common fields in score.
526 static void InitScore(VP8ModeScore* const rd) {
527 rd->D = 0;
528 rd->SD = 0;
529 rd->R = 0;
530 rd->H = 0;
531 rd->nz = 0;
532 rd->score = MAX_COST;
533 }
534
535 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
536 dst->D = src->D;
537 dst->SD = src->SD;
538 dst->R = src->R;
539 dst->H = src->H;
540 dst->nz = src->nz; // note that nz is not accumulated, but just copied.
541 dst->score = src->score;
542 }
543
544 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
545 dst->D += src->D;
546 dst->SD += src->SD;
547 dst->R += src->R;
548 dst->H += src->H;
549 dst->nz |= src->nz; // here, new nz bits are accumulated.
550 dst->score += src->score;
551 }
552
553 //------------------------------------------------------------------------------
554 // Performs trellis-optimized quantization.
555
556 // Trellis node
557 typedef struct {
558 int8_t prev; // best previous node
559 int8_t sign; // sign of coeff_i
560 int16_t level; // level
561 } Node;
562
563 // Score state
564 typedef struct {
565 score_t score; // partial RD score
566 const uint16_t* costs; // shortcut to cost tables
567 } ScoreState;
568
569 // If a coefficient was quantized to a value Q (using a neutral bias),
570 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
571 // We don't test negative values though.
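// With MIN_DELTA == 0 and MAX_DELTA == 1 this means each coefficient is
// re-examined at levels {Q, Q + 1} only (NUM_NODES == 2 candidates per step).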
572 #define MIN_DELTA 0 // how much lower level to try
573 #define MAX_DELTA 1 // how much higher
574 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
575 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
576 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
577
578 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
579 rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
580 }
581
582 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
583 score_t distortion) {
584 return rate * lambda + RD_DISTO_MULT * distortion;
585 }
586
587 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
588 int16_t in[16], int16_t out[16],
589 int ctx0, int coeff_type,
590 const VP8Matrix* const mtx,
591 int lambda) {
592 const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
593 CostArrayPtr const costs =
594 (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
595 const int first = (coeff_type == 0) ? 1 : 0;
596 Node nodes[16][NUM_NODES];
597 ScoreState score_states[2][NUM_NODES];
598 ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
599 ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
600 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous
601 score_t best_score;
602 int n, m, p, last;
603
604 {
605 score_t cost;
606 const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
607 const int last_proba = probas[VP8EncBands[first]][ctx0][0];
608
609 // compute the position of the last interesting coefficient
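// (i.e. roughly the last coefficient whose magnitude exceeds half the AC
// quantizer step, since err > thresh <=> in[j]^2 > q_[1]^2 / 4)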
610 last = first - 1;
611 for (n = 15; n >= first; --n) {
612 const int j = kZigzag[n];
613 const int err = in[j] * in[j];
614 if (err > thresh) {
615 last = n;
616 break;
617 }
618 }
619 // we don't need to go inspect up to n = 16 coeffs. We can just go up
620 // to last + 1 (inclusive) without losing much.
621 if (last < 15) ++last;
622
623 // compute 'skip' score. This is the max score one can do.
624 cost = VP8BitCost(0, last_proba);
625 best_score = RDScoreTrellis(lambda, cost, 0);
626
627 // initialize source node.
628 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
629 const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
630 ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
631 ss_cur[m].costs = costs[first][ctx0];
632 }
633 }
634
635 // traverse trellis.
636 for (n = first; n <= last; ++n) {
637 const int j = kZigzag[n];
638 const uint32_t Q = mtx->q_[j];
639 const uint32_t iQ = mtx->iq_[j];
640 const uint32_t B = BIAS(0x00); // neutral bias
641 // note: it's important to take sign of the _original_ coeff,
642 // so we don't have to consider level < 0 afterward.
643 const int sign = (in[j] < 0);
644 const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
645 int level0 = QUANTDIV(coeff0, iQ, B);
646 int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
647 if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
648 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
649
650 { // Swap current and previous score states
651 ScoreState* const tmp = ss_cur;
652 ss_cur = ss_prev;
653 ss_prev = tmp;
654 }
655
656 // test all alternate level values around level0.
657 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
658 Node* const cur = &NODE(n, m);
659 int level = level0 + m;
660 const int ctx = (level > 2) ? 2 : level;
661 const int band = VP8EncBands[n + 1];
662 score_t base_score;
663 score_t best_cur_score = MAX_COST;
664 int best_prev = 0; // default, in case
665
666 ss_cur[m].score = MAX_COST;
667 ss_cur[m].costs = costs[n + 1][ctx];
668 if (level < 0 || level > thresh_level) {
669 // Node is dead.
670 continue;
671 }
672
673 {
674 // Compute delta_error = how much coding this level will
675 // subtract from max_error as distortion.
676 // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
677 const int new_error = coeff0 - level * Q;
678 const int delta_error =
679 kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
680 base_score = RDScoreTrellis(lambda, 0, delta_error);
681 }
682
683 // Inspect all possible non-dead predecessors. Retain only the best one.
684 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
685 // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
686 // eliminated since their score can't be better than the current best.
687 const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
688 // Examine node assuming it's a non-terminal one.
689 const score_t score =
690 base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
691 if (score < best_cur_score) {
692 best_cur_score = score;
693 best_prev = p;
694 }
695 }
696 // Store best finding in current node.
697 cur->sign = sign;
698 cur->level = level;
699 cur->prev = best_prev;
700 ss_cur[m].score = best_cur_score;
701
702 // Now, record best terminal node (and thus best entry in the graph).
703 if (level != 0) {
704 const score_t last_pos_cost =
705 (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
706 const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
707 const score_t score = best_cur_score + last_pos_score;
708 if (score < best_score) {
709 best_score = score;
710 best_path[0] = n; // best eob position
711 best_path[1] = m; // best node index
712 best_path[2] = best_prev; // best predecessor
713 }
714 }
715 }
716 }
717
718 // Fresh start
719 memset(in + first, 0, (16 - first) * sizeof(*in));
720 memset(out + first, 0, (16 - first) * sizeof(*out));
721 if (best_path[0] == -1) {
722 return 0; // skip!
723 }
724
725 {
726 // Unwind the best path.
727 // Note: best-prev on terminal node is not necessarily equal to the
728 // best_prev for non-terminal. So we patch best_path[2] in.
729 int nz = 0;
730 int best_node = best_path[1];
731 n = best_path[0];
732 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal
733
734 for (; n >= first; --n) {
735 const Node* const node = &NODE(n, best_node);
736 const int j = kZigzag[n];
737 out[n] = node->sign ? -node->level : node->level;
738 nz |= node->level;
739 in[j] = out[n] * mtx->q_[j];
740 best_node = node->prev;
741 }
742 return (nz != 0);
743 }
744 }
745
746 #undef NODE
747
748 //------------------------------------------------------------------------------
749 // Performs: difference, transform, quantize, back-transform, add
750 // all at once. Output is the reconstructed block in *yuv_out, and the
751 // quantized levels in *levels.
752
753 static int ReconstructIntra16(VP8EncIterator* const it,
754 VP8ModeScore* const rd,
755 uint8_t* const yuv_out,
756 int mode) {
757 const VP8Encoder* const enc = it->enc_;
758 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
759 const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
760 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
761 int nz = 0;
762 int n;
763 int16_t tmp[16][16], dc_tmp[16];
764
765 for (n = 0; n < 16; n += 2) {
766 VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
767 }
768 VP8FTransformWHT(tmp[0], dc_tmp);
769 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
770
771 if (DO_TRELLIS_I16 && it->do_trellis_) {
772 int x, y;
773 VP8IteratorNzToBytes(it);
774 for (y = 0, n = 0; y < 4; ++y) {
775 for (x = 0; x < 4; ++x, ++n) {
776 const int ctx = it->top_nz_[x] + it->left_nz_[y];
777 const int non_zero =
778 TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
779 &dqm->y1_, dqm->lambda_trellis_i16_);
780 it->top_nz_[x] = it->left_nz_[y] = non_zero;
781 rd->y_ac_levels[n][0] = 0;
782 nz |= non_zero << n;
783 }
784 }
785 } else {
786 for (n = 0; n < 16; n += 2) {
787 // Zero-out the first coeff, so that: a) nz is correct below, and
788 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
789 tmp[n][0] = tmp[n + 1][0] = 0;
790 nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
791 assert(rd->y_ac_levels[n + 0][0] == 0);
792 assert(rd->y_ac_levels[n + 1][0] == 0);
793 }
794 }
795
796 // Transform back
797 VP8TransformWHT(dc_tmp, tmp[0]);
798 for (n = 0; n < 16; n += 2) {
799 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
800 }
801
802 return nz;
803 }
804
805 static int ReconstructIntra4(VP8EncIterator* const it,
806 int16_t levels[16],
807 const uint8_t* const src,
808 uint8_t* const yuv_out,
809 int mode) {
810 const VP8Encoder* const enc = it->enc_;
811 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
812 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
813 int nz = 0;
814 int16_t tmp[16];
815
816 VP8FTransform(src, ref, tmp);
817 if (DO_TRELLIS_I4 && it->do_trellis_) {
818 const int x = it->i4_ & 3, y = it->i4_ >> 2;
819 const int ctx = it->top_nz_[x] + it->left_nz_[y];
820 nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
821 dqm->lambda_trellis_i4_);
822 } else {
823 nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
824 }
825 VP8ITransform(ref, tmp, yuv_out, 0);
826 return nz;
827 }
828
829 //------------------------------------------------------------------------------
830 // DC-error diffusion
831
832 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
833 // diffused) to avoid a 'rainbow' chessboard pattern of blocks at q~=0.
834 #define C1 7 // fraction of error sent to the 4x4 block below
835 #define C2 8 // fraction of error sent to the 4x4 block on the right
836 #define DSHIFT 4
837 #define DSCALE 1 // storage descaling, needed to make the error fit int8_t
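// In other words, with DSHIFT == 4 the block below receives C1/16 == 7/16 of
// the DC error and the block to the right C2/16 == 8/16; the remaining 1/16
// is deliberately dropped (the "15/16th" mentioned above).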
838
839 // Quantize as usual, but also compute and return the quantization error.
840 // Error is already divided by DSHIFT.
841 static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
842 int V = *v;
843 const int sign = (V < 0);
844 if (sign) V = -V;
845 if (V > (int)mtx->zthresh_[0]) {
846 const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
847 const int err = (V - qV);
848 *v = sign ? -qV : qV;
849 return (sign ? -err : err) >> DSCALE;
850 }
851 *v = 0;
852 return (sign ? -V : V) >> DSCALE;
853 }
854
855 static void CorrectDCValues(const VP8EncIterator* const it,
856 const VP8Matrix* const mtx,
857 int16_t tmp[][16], VP8ModeScore* const rd) {
858 // | top[0] | top[1]
859 // --------+--------+---------
860 // left[0] | tmp[0] tmp[1] <-> err0 err1
861 // left[1] | tmp[2] tmp[3] err2 err3
862 //
863 // Final errors {err1,err2,err3} are preserved and later restored
864 // as top[]/left[] on the next block.
865 int ch;
866 for (ch = 0; ch <= 1; ++ch) {
867 const int8_t* const top = it->top_derr_[it->x_][ch];
868 const int8_t* const left = it->left_derr_[ch];
869 int16_t (* const c)[16] = &tmp[ch * 4];
870 int err0, err1, err2, err3;
871 c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
872 err0 = QuantizeSingle(&c[0][0], mtx);
873 c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
874 err1 = QuantizeSingle(&c[1][0], mtx);
875 c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
876 err2 = QuantizeSingle(&c[2][0], mtx);
877 c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
878 err3 = QuantizeSingle(&c[3][0], mtx);
879 // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
880 // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
881 assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
882 rd->derr[ch][0] = (int8_t)err1;
883 rd->derr[ch][1] = (int8_t)err2;
884 rd->derr[ch][2] = (int8_t)err3;
885 }
886 }
887
888 static void StoreDiffusionErrors(VP8EncIterator* const it,
889 const VP8ModeScore* const rd) {
890 int ch;
891 for (ch = 0; ch <= 1; ++ch) {
892 int8_t* const top = it->top_derr_[it->x_][ch];
893 int8_t* const left = it->left_derr_[ch];
894 left[0] = rd->derr[ch][0]; // restore err1
895 left[1] = 3 * rd->derr[ch][2] >> 2; // ... 3/4th of err3
896 top[0] = rd->derr[ch][1]; // ... err2
897 top[1] = rd->derr[ch][2] - left[1]; // ... 1/4th of err3.
898 }
899 }
900
901 #undef C1
902 #undef C2
903 #undef DSHIFT
904 #undef DSCALE
905
906 //------------------------------------------------------------------------------
907
908 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
909 uint8_t* const yuv_out, int mode) {
910 const VP8Encoder* const enc = it->enc_;
911 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
912 const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
913 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
914 int nz = 0;
915 int n;
916 int16_t tmp[8][16];
917
918 for (n = 0; n < 8; n += 2) {
919 VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
920 }
921 if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
922
923 if (DO_TRELLIS_UV && it->do_trellis_) {
924 int ch, x, y;
925 for (ch = 0, n = 0; ch <= 2; ch += 2) {
926 for (y = 0; y < 2; ++y) {
927 for (x = 0; x < 2; ++x, ++n) {
928 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
929 const int non_zero =
930 TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
931 &dqm->uv_, dqm->lambda_trellis_uv_);
932 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
933 nz |= non_zero << n;
934 }
935 }
936 }
937 } else {
938 for (n = 0; n < 8; n += 2) {
939 nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
940 }
941 }
942
943 for (n = 0; n < 8; n += 2) {
944 VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
945 }
946 return (nz << 16);
947 }
948
949 //------------------------------------------------------------------------------
950 // RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
951 // Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
952
953 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
954 // We look at the first three AC coefficients to determine the average delta
955 // between the sub-4x4 blocks.
956 const int v0 = abs(DCs[1]);
957 const int v1 = abs(DCs[2]);
958 const int v2 = abs(DCs[4]);
959 int max_v = (v1 > v0) ? v1 : v0;
960 max_v = (v2 > max_v) ? v2 : max_v;
961 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
962 }
963
964 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
965 VP8ModeScore* const tmp = *a;
966 *a = *b;
967 *b = tmp;
968 }
969
970 static void SwapPtr(uint8_t** a, uint8_t** b) {
971 uint8_t* const tmp = *a;
972 *a = *b;
973 *b = tmp;
974 }
975
976 static void SwapOut(VP8EncIterator* const it) {
977 SwapPtr(&it->yuv_out_, &it->yuv_out2_);
978 }
979
980 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
981 score_t score = 0;
982 while (num_blocks-- > 0) { // TODO(skal): refine positional scoring?
983 int i;
984 for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC
985 score += (levels[i] != 0);
986 if (score > thresh) return 0;
987 }
988 levels += 16;
989 }
990 return 1;
991 }
992
993 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
994 const int kNumBlocks = 16;
995 VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
996 const int lambda = dqm->lambda_i16_;
997 const int tlambda = dqm->tlambda_;
998 const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
999 VP8ModeScore rd_tmp;
1000 VP8ModeScore* rd_cur = &rd_tmp;
1001 VP8ModeScore* rd_best = rd;
1002 int mode;
1003
1004 rd->mode_i16 = -1;
1005 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1006 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC; // scratch buffer
1007 rd_cur->mode_i16 = mode;
1008
1009 // Reconstruct
1010 rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
1011
1012 // Measure RD-score
1013 rd_cur->D = VP8SSE16x16(src, tmp_dst);
1014 rd_cur->SD =
1015 tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
1016 rd_cur->H = VP8FixedCostsI16[mode];
1017 rd_cur->R = VP8GetCostLuma16(it, rd_cur);
1018 if (mode > 0 &&
1019 IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
1020 // penalty to avoid a flat area being mispredicted by a complex mode
1021 rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
1022 }
1023
1024 // Since we always examine Intra16 first, we can overwrite *rd directly.
1025 SetRDScore(lambda, rd_cur);
1026 if (mode == 0 || rd_cur->score < rd_best->score) {
1027 SwapModeScore(&rd_cur, &rd_best);
1028 SwapOut(it);
1029 }
1030 }
1031 if (rd_best != rd) {
1032 memcpy(rd, rd_best, sizeof(*rd));
1033 }
1034 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
1035 VP8SetIntra16Mode(it, rd->mode_i16);
1036
1037 // If we have a blocky macroblock (only DCs are non-zero) with fairly high
1038 // distortion, record the max delta so we can later adjust the minimal filtering
1039 // strength needed to smooth these blocks out.
1040 if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
1041 StoreMaxDelta(dqm, rd->y_dc_levels);
1042 }
1043 }
1044
1045 //------------------------------------------------------------------------------
1046
1047 // return the cost array corresponding to the surrounding prediction modes.
1048 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
1049 const uint8_t modes[16]) {
1050 const int preds_w = it->enc_->preds_w_;
1051 const int x = (it->i4_ & 3), y = it->i4_ >> 2;
1052 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
1053 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
1054 return VP8FixedCostsI4[top][left];
1055 }
1056
1057 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
1058 const VP8Encoder* const enc = it->enc_;
1059 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1060 const int lambda = dqm->lambda_i4_;
1061 const int tlambda = dqm->tlambda_;
1062 const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
1063 uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
1064 int total_header_bits = 0;
1065 VP8ModeScore rd_best;
1066
1067 if (enc->max_i4_header_bits_ == 0) {
1068 return 0;
1069 }
1070
1071 InitScore(&rd_best);
1072 rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145)
1073 SetRDScore(dqm->lambda_mode_, &rd_best);
1074 VP8IteratorStartI4(it);
1075 do {
1076 const int kNumBlocks = 1;
1077 VP8ModeScore rd_i4;
1078 int mode;
1079 int best_mode = -1;
1080 const uint8_t* const src = src0 + VP8Scan[it->i4_];
1081 const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1082 uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1083 uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
1084
1085 InitScore(&rd_i4);
1086 VP8MakeIntra4Preds(it);
1087 for (mode = 0; mode < NUM_BMODES; ++mode) {
1088 VP8ModeScore rd_tmp;
1089 int16_t tmp_levels[16];
1090
1091 // Reconstruct
1092 rd_tmp.nz =
1093 ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1094
1095 // Compute RD-score
1096 rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1097 rd_tmp.SD =
1098 tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1099 : 0;
1100 rd_tmp.H = mode_costs[mode];
1101
1102 // Add flatness penalty
1103 if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1104 rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1105 } else {
1106 rd_tmp.R = 0;
1107 }
1108
1109 // early-out check
1110 SetRDScore(lambda, &rd_tmp);
1111 if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
1112
1113 // finish computing score
1114 rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1115 SetRDScore(lambda, &rd_tmp);
1116
1117 if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
1118 CopyScore(&rd_i4, &rd_tmp);
1119 best_mode = mode;
1120 SwapPtr(&tmp_dst, &best_block);
1121 memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1122 sizeof(rd_best.y_ac_levels[it->i4_]));
1123 }
1124 }
1125 SetRDScore(dqm->lambda_mode_, &rd_i4);
1126 AddScore(&rd_best, &rd_i4);
1127 if (rd_best.score >= rd->score) {
1128 return 0;
1129 }
1130 total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
1131 if (total_header_bits > enc->max_i4_header_bits_) {
1132 return 0;
1133 }
1134 // Copy selected samples if not in the right place already.
1135 if (best_block != best_blocks + VP8Scan[it->i4_]) {
1136 VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1137 }
1138 rd->modes_i4[it->i4_] = best_mode;
1139 it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
1140 } while (VP8IteratorRotateI4(it, best_blocks));
1141
1142 // finalize state
1143 CopyScore(rd, &rd_best);
1144 VP8SetIntra4Mode(it, rd->modes_i4);
1145 SwapOut(it);
1146 memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1147 return 1; // select intra4x4 over intra16x16
1148 }
1149
1150 //------------------------------------------------------------------------------
1151
1152 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1153 const int kNumBlocks = 8;
1154 const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1155 const int lambda = dqm->lambda_uv_;
1156 const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1157 uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC; // scratch buffer
1158 uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1159 uint8_t* dst = dst0;
1160 VP8ModeScore rd_best;
1161 int mode;
1162
1163 rd->mode_uv = -1;
1164 InitScore(&rd_best);
1165 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1166 VP8ModeScore rd_uv;
1167
1168 // Reconstruct
1169 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1170
1171 // Compute RD-score
1172 rd_uv.D = VP8SSE16x8(src, tmp_dst);
1173 rd_uv.SD = 0; // not calling TDisto here: it tends to flatten areas.
1174 rd_uv.H = VP8FixedCostsUV[mode];
1175 rd_uv.R = VP8GetCostUV(it, &rd_uv);
1176 if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
1177 rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1178 }
1179
1180 SetRDScore(lambda, &rd_uv);
1181 if (mode == 0 || rd_uv.score < rd_best.score) {
1182 CopyScore(&rd_best, &rd_uv);
1183 rd->mode_uv = mode;
1184 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1185 if (it->top_derr_ != NULL) {
1186 memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
1187 }
1188 SwapPtr(&dst, &tmp_dst);
1189 }
1190 }
1191 VP8SetIntraUVMode(it, rd->mode_uv);
1192 AddScore(rd, &rd_best);
1193 if (dst != dst0) { // copy 16x8 block if needed
1194 VP8Copy16x8(dst, dst0);
1195 }
1196 if (it->top_derr_ != NULL) { // store diffusion errors for next block
1197 StoreDiffusionErrors(it, rd);
1198 }
1199 }
1200
1201 //------------------------------------------------------------------------------
1202 // Final reconstruction and quantization.
1203
1204 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1205 const VP8Encoder* const enc = it->enc_;
1206 const int is_i16 = (it->mb_->type_ == 1);
1207 int nz = 0;
1208
1209 if (is_i16) {
1210 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1211 } else {
1212 VP8IteratorStartI4(it);
1213 do {
1214 const int mode =
1215 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
1216 const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1217 uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1218 VP8MakeIntra4Preds(it);
1219 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1220 src, dst, mode) << it->i4_;
1221 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1222 }
1223
1224 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1225 rd->nz = nz;
1226 }
1227
1228 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
1229 static void RefineUsingDistortion(VP8EncIterator* const it,
1230 int try_both_modes, int refine_uv_mode,
1231 VP8ModeScore* const rd) {
1232 score_t best_score = MAX_COST;
1233 int nz = 0;
1234 int mode;
1235 int is_i16 = try_both_modes || (it->mb_->type_ == 1);
1236
1237 const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1238 // Some empirical constants of approximately the right order of magnitude.
1239 const int lambda_d_i16 = 106;
1240 const int lambda_d_i4 = 11;
1241 const int lambda_d_uv = 120;
1242 score_t score_i4 = dqm->i4_penalty_;
1243 score_t i4_bit_sum = 0;
1244 const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1245 : MAX_COST; // no early-out allowed
1246
1247 if (is_i16) { // First, evaluate Intra16 distortion
1248 int best_mode = -1;
1249 const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1250 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1251 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1252 const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1253 + VP8FixedCostsI16[mode] * lambda_d_i16;
1254 if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
1255 continue;
1256 }
1257 if (score < best_score) {
1258 best_mode = mode;
1259 best_score = score;
1260 }
1261 }
1262 VP8SetIntra16Mode(it, best_mode);
1263 // we'll reconstruct later, if i16 mode actually gets selected
1264 }
1265
1266 // Next, evaluate Intra4
1267 if (try_both_modes || !is_i16) {
1268 // We don't evaluate the rate here, but just account for it through a
1269 // constant penalty (i4 mode usually needs more bits compared to i16).
1270 is_i16 = 0;
1271 VP8IteratorStartI4(it);
1272 do {
1273 int best_i4_mode = -1;
1274 score_t best_i4_score = MAX_COST;
1275 const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1276 const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1277
1278 VP8MakeIntra4Preds(it);
1279 for (mode = 0; mode < NUM_BMODES; ++mode) {
1280 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1281 const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1282 + mode_costs[mode] * lambda_d_i4;
1283 if (score < best_i4_score) {
1284 best_i4_mode = mode;
1285 best_i4_score = score;
1286 }
1287 }
1288 i4_bit_sum += mode_costs[best_i4_mode];
1289 rd->modes_i4[it->i4_] = best_i4_mode;
1290 score_i4 += best_i4_score;
1291 if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
1292 // Intra4 won't be better than Intra16. Bail out and pick Intra16.
1293 is_i16 = 1;
1294 break;
1295 } else { // reconstruct partial block inside yuv_out2_ buffer
1296 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1297 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1298 src, tmp_dst, best_i4_mode) << it->i4_;
1299 }
1300 } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1301 }
1302
1303 // Final reconstruction, depending on which mode is selected.
1304 if (!is_i16) {
1305 VP8SetIntra4Mode(it, rd->modes_i4);
1306 SwapOut(it);
1307 best_score = score_i4;
1308 } else {
1309 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
1310 }
1311
1312 // ... and UV!
1313 if (refine_uv_mode) {
1314 int best_mode = -1;
1315 score_t best_uv_score = MAX_COST;
1316 const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1317 for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
1318 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1319 const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1320 + VP8FixedCostsUV[mode] * lambda_d_uv;
1321 if (score < best_uv_score) {
1322 best_mode = mode;
1323 best_uv_score = score;
1324 }
1325 }
1326 VP8SetIntraUVMode(it, best_mode);
1327 }
1328 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1329
1330 rd->nz = nz;
1331 rd->score = best_score;
1332 }
1333
1334 //------------------------------------------------------------------------------
1335 // Entry point
1336
1337 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1338 VP8RDLevel rd_opt) {
1339 int is_skipped;
1340 const int method = it->enc_->method_;
1341
1342 InitScore(rd);
1343
1344 // We can perform predictions for Luma16x16 and Chroma8x8 already.
1345 // Luma4x4 predictions need to be done as we go.
1346 VP8MakeLuma16Preds(it);
1347 VP8MakeChroma8Preds(it);
1348
1349 if (rd_opt > RD_OPT_NONE) {
1350 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1351 PickBestIntra16(it, rd);
1352 if (method >= 2) {
1353 PickBestIntra4(it, rd);
1354 }
1355 PickBestUV(it, rd);
1356 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1357 it->do_trellis_ = 1;
1358 SimpleQuantize(it, rd);
1359 }
1360 } else {
1361 // At this point we have heuristically decided intra16 / intra4.
1362 // For method >= 2, pick the best intra4/intra16 based on SSE (a tad slower).
1363 // For method <= 1, we don't re-examine the decision but just go ahead with
1364 // quantization/reconstruction.
1365 RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
1366 }
1367 is_skipped = (rd->nz == 0);
1368 VP8SetSkip(it, is_skipped);
1369 return is_skipped;
1370 }
1371