1 /*
2 * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 /* clang-format off */
13
14 #ifdef HAVE_CONFIG_H
15 # include "config.h"
16 #endif
17
18 #include <math.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include "aom_dsp/entcode.h"
22 #include "aom_dsp/entenc.h"
23 #include "av1/common/blockd.h"
24 #include "av1/common/odintrin.h"
25 #include "av1/common/partition.h"
26 #include "av1/common/pvq_state.h"
27 #include "av1/encoder/encodemb.h"
28 #include "av1/encoder/pvq_encoder.h"
29 #include "aom_ports/system_state.h"
30
31 /*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
32 dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
33 #define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
34
aom_write_symbol_pvq(aom_writer * w,int symb,aom_cdf_prob * cdf,int nsymbs)35 void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
36 int nsymbs) {
37 if (cdf[0] == 0)
38 aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
39 aom_write_symbol(w, symb, cdf, nsymbs);
40 }
41
aom_encode_pvq_codeword(aom_writer * w,od_pvq_codeword_ctx * adapt,const od_coeff * in,int n,int k)42 static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
43 const od_coeff *in, int n, int k) {
44 int i;
45 aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
46 for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0);
47 }
48
49 /* Computes 1/sqrt(i) using a table for small values. */
od_rsqrt_table(int i)50 static double od_rsqrt_table(int i) {
51 static double table[16] = {
52 1.000000, 0.707107, 0.577350, 0.500000,
53 0.447214, 0.408248, 0.377964, 0.353553,
54 0.333333, 0.316228, 0.301511, 0.288675,
55 0.277350, 0.267261, 0.258199, 0.250000};
56 if (i <= 16) return table[i-1];
57 else return 1./sqrt(i);
58 }
59
60 /*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
61 where 0 <= i < table_size.*/
od_custom_rsqrt_dynamic_table(const double * table,const int table_size,const double start,const int i)62 static double od_custom_rsqrt_dynamic_table(const double* table,
63 const int table_size, const double start, const int i) {
64 if (i < table_size) return table[i];
65 else return od_rsqrt_table((int)(start + 2*i + 1));
66 }
67
68 /*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/
od_fill_dynamic_rsqrt_table(double * table,const int table_size,const double start)69 static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
70 const double start) {
71 int i;
72 for (i = 0; i < table_size; i++)
73 table[i] = od_rsqrt_table((int)(start + 2*i + 1));
74 }
75
76 /** Find the codepoint on the given PSphere closest to the desired
77 * vector. Double-precision PVQ search just to make sure our tests
78 * aren't limited by numerical accuracy.
79 *
80 * @param [in] xcoeff input vector to quantize (x in the math doc)
81 * @param [in] n number of dimensions
82 * @param [in] k number of pulses
83 * @param [out] ypulse optimal codevector found (y in the math doc)
84 * @param [out] g2 multiplier for the distortion (typically squared
85 * gain units)
86 * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
87 * @param [in] prev_k number of pulses already in ypulse that we should
88 * reuse for the search (or 0 for a new search)
89 * @return cosine distance between x and y (between 0 and 1)
90 */
pvq_search_rdo_double_c(const od_val16 * xcoeff,int n,int k,od_coeff * ypulse,double g2,double pvq_norm_lambda,int prev_k)91 double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
92 od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
93 int i, j;
94 double xy;
95 double yy;
96 /* TODO - This blows our 8kB stack space budget and should be fixed when
97 converting PVQ to fixed point. */
98 double x[MAXN];
99 double xx;
100 double lambda;
101 double norm_1;
102 int rdo_pulses;
103 double delta_rate;
104 xx = xy = yy = 0;
105 for (j = 0; j < n; j++) {
106 x[j] = fabs((float)xcoeff[j]);
107 xx += x[j]*x[j];
108 }
109 norm_1 = 1./sqrt(1e-30 + xx);
110 lambda = pvq_norm_lambda/(1e-30 + g2);
111 i = 0;
112 if (prev_k > 0 && prev_k <= k) {
113 /* We reuse pulses from a previous search so we don't have to search them
114 again. */
115 for (j = 0; j < n; j++) {
116 ypulse[j] = abs(ypulse[j]);
117 xy += x[j]*ypulse[j];
118 yy += ypulse[j]*ypulse[j];
119 i += ypulse[j];
120 }
121 }
122 else if (k > 2) {
123 double l1_norm;
124 double l1_inv;
125 l1_norm = 0;
126 for (j = 0; j < n; j++) l1_norm += x[j];
127 l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
128 for (j = 0; j < n; j++) {
129 double tmp;
130 tmp = k*x[j]*l1_inv;
131 ypulse[j] = OD_MAXI(0, (int)floor(tmp));
132 xy += x[j]*ypulse[j];
133 yy += ypulse[j]*ypulse[j];
134 i += ypulse[j];
135 }
136 }
137 else OD_CLEAR(ypulse, n);
138
139 /* Only use RDO on the last few pulses. This not only saves CPU, but using
140 RDO on all pulses actually makes the results worse for reasons I don't
141 fully understand. */
142 rdo_pulses = 1 + k/4;
143 /* Rough assumption for now, the last position costs about 3 bits more than
144 the first. */
145 delta_rate = 3./n;
146 /* Search one pulse at a time */
147 for (; i < k - rdo_pulses; i++) {
148 int pos;
149 double best_xy;
150 double best_yy;
151 pos = 0;
152 best_xy = -10;
153 best_yy = 1;
154 for (j = 0; j < n; j++) {
155 double tmp_xy;
156 double tmp_yy;
157 tmp_xy = xy + x[j];
158 tmp_yy = yy + 2*ypulse[j] + 1;
159 tmp_xy *= tmp_xy;
160 if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
161 best_xy = tmp_xy;
162 best_yy = tmp_yy;
163 pos = j;
164 }
165 }
166 xy = xy + x[pos];
167 yy = yy + 2*ypulse[pos] + 1;
168 ypulse[pos]++;
169 }
170 /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
171 and since x^2 and y^2 are constant, we just maximize x*y, plus a
172 lambda*rate term. Note that since x and y aren't normalized here,
173 we need to divide by sqrt(x^2)*sqrt(y^2). */
174 for (; i < k; i++) {
175 double rsqrt_table[4];
176 int rsqrt_table_size = 4;
177 int pos;
178 double best_cost;
179 pos = 0;
180 best_cost = -1e5;
181 /*Fill the small rsqrt lookup table with inputs relative to yy.
182 Specifically, the table of n values is filled with
183 rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/
184 od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
185 for (j = 0; j < n; j++) {
186 double tmp_xy;
187 double tmp_yy;
188 tmp_xy = xy + x[j];
189 /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
190 tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
191 yy, ypulse[j]);
192 tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
193 if (j == 0 || tmp_xy > best_cost) {
194 best_cost = tmp_xy;
195 pos = j;
196 }
197 }
198 xy = xy + x[pos];
199 yy = yy + 2*ypulse[pos] + 1;
200 ypulse[pos]++;
201 }
202 for (i = 0; i < n; i++) {
203 if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
204 }
205 return xy/(1e-100 + sqrt(xx*yy));
206 }
207
208 /** Encodes the gain so that the return value increases with the
209 * distance |x-ref|, so that we can encode a zero when x=ref. The
210 * value x=0 is not covered because it is only allowed in the noref
211 * case.
212 *
213 * @param [in] x quantized gain to encode
214 * @param [in] ref quantized gain of the reference
215 * @return interleave-encoded quantized gain value
216 */
neg_interleave(int x,int ref)217 static int neg_interleave(int x, int ref) {
218 if (x < ref) return -2*(x - ref) - 1;
219 else if (x < 2*ref) return 2*(x - ref);
220 else return x-1;
221 }
222
od_vector_is_null(const od_coeff * x,int len)223 int od_vector_is_null(const od_coeff *x, int len) {
224 int i;
225 for (i = 0; i < len; i++) if (x[i]) return 0;
226 return 1;
227 }
228
od_pvq_rate(int qg,int icgr,int theta,int ts,const od_adapt_ctx * adapt,const od_coeff * y0,int k,int n,int speed)229 static double od_pvq_rate(int qg, int icgr, int theta, int ts,
230 const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
231 double rate;
232 if (k == 0) rate = 0;
233 else if (speed > 0) {
234 int i;
235 int sum;
236 double f;
237 /* Compute "center of mass" of the pulse vector. */
238 sum = 0;
239 for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
240 f = sum/(double)(k*n);
241 /* Estimates the number of bits it will cost to encode K pulses in
242 N dimensions based on hand-tuned fit for bitrate vs K, N and
243 "center of mass". */
244 rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
245 }
246 else {
247 aom_writer w;
248 od_pvq_codeword_ctx cd;
249 int tell;
250 #if !CONFIG_ANS
251 od_ec_enc_init(&w.ec, 1000);
252 #else
253 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
254 #endif
255 OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
256 #if !CONFIG_ANS
257 tell = od_ec_enc_tell_frac(&w.ec);
258 #else
259 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
260 #endif
261 aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
262 #if !CONFIG_ANS
263 rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.;
264 od_ec_enc_clear(&w.ec);
265 #else
266 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
267 #endif
268 }
269 if (qg > 0 && theta >= 0) {
270 /* Approximate cost of entropy-coding theta */
271 rate += .9*OD_LOG2(ts);
272 if (qg == icgr) rate -= .5;
273 }
274 return rate;
275 }
276
277 #define MAX_PVQ_ITEMS (20)
278 /* This stores the information about a PVQ search candidate, so we can sort
279 based on K. */
280 typedef struct {
281 int gain;
282 int k;
283 od_val32 qtheta;
284 int theta;
285 int ts;
286 od_val32 qcg;
287 } pvq_search_item;
288
items_compare(pvq_search_item * a,pvq_search_item * b)289 int items_compare(pvq_search_item *a, pvq_search_item *b) {
290 /* Break ties in K with gain to ensure a stable sort.
291 Otherwise, the order depends on qsort implementation. */
292 return a->k == b->k ? a->gain - b->gain : a->k - b->k;
293 }
294
295 /** Perform PVQ quantization with prediction, trying several
296 * possible gains and angles. See draft-valin-videocodec-pvq and
297 * http://jmvalin.ca/slides/pvq.pdf for more details.
298 *
299 * @param [out] out coefficients after quantization
300 * @param [in] x0 coefficients before quantization
301 * @param [in] r0 reference, aka predicted coefficients
302 * @param [in] n number of dimensions
303 * @param [in] q0 quantization step size
304 * @param [out] y pulse vector (i.e. selected PVQ codevector)
305 * @param [out] itheta angle between input and reference (-1 if noref)
306 * @param [out] vk total number of pulses
307 * @param [in] beta per-band activity masking beta param
308 * @param [out] skip_diff distortion cost of skipping this block
309 * (accumulated)
310 * @param [in] is_keyframe whether we're encoding a keyframe
311 * @param [in] pli plane index
312 * @param [in] adapt probability adaptation context
313 * @param [in] qm QM with magnitude compensation
314 * @param [in] qm_inv Inverse of QM with magnitude compensation
315 * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
316 * @param [in] speed Make search faster by making approximations
317 * @return gain index of the quatized gain
318 */
pvq_theta(od_coeff * out,const od_coeff * x0,const od_coeff * r0,int n,int q0,od_coeff * y,int * itheta,int * vk,od_val16 beta,double * skip_diff,int is_keyframe,int pli,const od_adapt_ctx * adapt,const int16_t * qm,const int16_t * qm_inv,double pvq_norm_lambda,int speed)319 static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
320 int n, int q0, od_coeff *y, int *itheta, int *vk,
321 od_val16 beta, double *skip_diff, int is_keyframe, int pli,
322 const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
323 double pvq_norm_lambda, int speed) {
324 od_val32 g;
325 od_val32 gr;
326 od_coeff y_tmp[MAXN + 3];
327 int i;
328 /* Number of pulses. */
329 int k;
330 /* Companded gain of x and reference, normalized to q. */
331 od_val32 cg;
332 od_val32 cgr;
333 int icgr;
334 int qg;
335 /* Best RDO cost (D + lamdba*R) so far. */
336 double best_cost;
337 double dist0;
338 /* Distortion (D) that corresponds to the best RDO cost. */
339 double best_dist;
340 double dist;
341 /* Sign of Householder reflection. */
342 int s;
343 /* Dimension on which Householder reflects. */
344 int m;
345 od_val32 theta;
346 double corr;
347 int best_k;
348 od_val32 best_qtheta;
349 od_val32 gain_offset;
350 int noref;
351 double skip_dist;
352 int cfl_enabled;
353 int skip;
354 double gain_weight;
355 od_val16 x16[MAXN];
356 od_val16 r16[MAXN];
357 int xshift;
358 int rshift;
359 /* Give more weight to gain error when calculating the total distortion. */
360 gain_weight = 1.0;
361 OD_ASSERT(n > 1);
362 corr = 0;
363 #if !defined(OD_FLOAT_PVQ)
364 /* Shift needed to make x fit in 16 bits even after rotation.
365 This shift value is not normative (it can be changed without breaking
366 the bitstream) */
367 xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
368 /* Shift needed to make the reference fit in 15 bits, so that the Householder
369 vector can fit in 16 bits.
370 This shift value *is* normative, and has to match the decoder. */
371 rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
372 #else
373 xshift = 0;
374 rshift = 0;
375 #endif
376 for (i = 0; i < n; i++) {
377 #if defined(OD_FLOAT_PVQ)
378 /*This is slightly different from the original float PVQ code,
379 where the qm was applied in the accumulation in od_pvq_compute_gain and
380 the vectors were od_coeffs, not od_val16 (i.e. double).*/
381 x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
382 r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
383 #else
384 x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
385 r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
386 #endif
387 corr += OD_MULT16_16(x16[i], r16[i]);
388 }
389 cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
390 cg = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
391 cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
392 if (cfl_enabled) cgr = OD_CGAIN_SCALE;
393 /* gain_offset is meant to make sure one of the quantized gains has
394 exactly the same gain as the reference. */
395 #if defined(OD_FLOAT_PVQ)
396 icgr = (int)floor(.5 + cgr);
397 #else
398 icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
399 #endif
400 gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
401 /* Start search with null case: gain=0, no pulse. */
402 qg = 0;
403 dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
404 best_dist = dist;
405 best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
406 n, speed);
407 noref = 1;
408 best_k = 0;
409 *itheta = -1;
410 OD_CLEAR(y, n);
411 best_qtheta = 0;
412 m = 0;
413 s = 1;
414 corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
415 corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
416 if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
417 else {
418 skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
419 + cgr*(double)cg*(2 - 2*corr);
420 skip_dist *= OD_CGAIN_SCALE_2;
421 }
422 if (!is_keyframe) {
423 /* noref, gain=0 isn't allowed, but skip is allowed. */
424 od_val32 scgr;
425 scgr = OD_MAXF(0,gain_offset);
426 if (icgr == 0) {
427 best_dist = gain_weight*(cg - scgr)*(cg - scgr)
428 + scgr*(double)cg*(2 - 2*corr);
429 best_dist *= OD_CGAIN_SCALE_2;
430 }
431 best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
432 NULL, 0, n, speed);
433 best_qtheta = 0;
434 *itheta = 0;
435 noref = 0;
436 }
437 dist0 = best_dist;
438 if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
439 od_val16 xr[MAXN];
440 int gain_bound;
441 int prev_k;
442 pvq_search_item items[MAX_PVQ_ITEMS];
443 int idx;
444 int nitems;
445 double cos_dist;
446 idx = 0;
447 gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
448 /* Perform theta search only if prediction is useful. */
449 theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
450 m = od_compute_householder(r16, n, gr, &s, rshift);
451 od_apply_householder(xr, x16, r16, n);
452 prev_k = 0;
453 for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
454 /* Compute all candidate PVQ searches within a reasonable range of gain
455 and theta. */
456 for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
457 int j;
458 od_val32 qcg;
459 int ts;
460 int theta_lower;
461 int theta_upper;
462 /* Quantized companded gain */
463 qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
464 /* Set angular resolution (in ra) to match the encoded gain */
465 ts = od_pvq_compute_max_theta(qcg, beta);
466 theta_lower = OD_MAXI(0, (int)floor(.5 +
467 theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
468 theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
469 /* Include the angles within a reasonable range. */
470 for (j = theta_lower; j <= theta_upper; j++) {
471 od_val32 qtheta;
472 qtheta = od_pvq_compute_theta(j, ts);
473 k = od_pvq_compute_k(qcg, j, 0, n, beta);
474 items[idx].gain = i;
475 items[idx].theta = j;
476 items[idx].k = k;
477 items[idx].qcg = qcg;
478 items[idx].qtheta = qtheta;
479 items[idx].ts = ts;
480 idx++;
481 OD_ASSERT(idx < MAX_PVQ_ITEMS);
482 }
483 }
484 nitems = idx;
485 cos_dist = 0;
486 /* Sort PVQ search candidates in ascending order of pulses K so that
487 we can reuse all the previously searched pulses across searches. */
488 qsort(items, nitems, sizeof(items[0]),
489 (int (*)(const void *, const void *))items_compare);
490 /* Search for the best gain/theta in order. */
491 for (idx = 0; idx < nitems; idx++) {
492 int j;
493 od_val32 qcg;
494 int ts;
495 double cost;
496 double dist_theta;
497 double sin_prod;
498 od_val32 qtheta;
499 /* Quantized companded gain */
500 qcg = items[idx].qcg;
501 i = items[idx].gain;
502 j = items[idx].theta;
503 /* Set angular resolution (in ra) to match the encoded gain */
504 ts = items[idx].ts;
505 /* Search for the best angle within a reasonable range. */
506 qtheta = items[idx].qtheta;
507 k = items[idx].k;
508 /* Compute the minimal possible distortion by not taking the PVQ
509 cos_dist into account. */
510 dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
511 dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
512 dist *= OD_CGAIN_SCALE_2;
513 /* If we have no hope of beating skip (including a 1-bit worst-case
514 penalty), stop now. */
515 if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
516 sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
517 OD_TRIG_SCALE_1;
518 /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
519 that's the factor by which cos_dist is multiplied to get the
520 distortion metric. */
521 if (k == 0) {
522 cos_dist = 0;
523 OD_CLEAR(y_tmp, n-1);
524 }
525 else if (k != prev_k) {
526 cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
527 qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
528 }
529 prev_k = k;
530 /* See Jmspeex' Journal of Dubious Theoretical Results. */
531 dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
532 + sin_prod*(2 - 2*cos_dist);
533 dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
534 dist *= OD_CGAIN_SCALE_2;
535 /* Do approximate RDO. */
536 cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
537 k, n, speed);
538 if (cost < best_cost) {
539 best_cost = cost;
540 best_dist = dist;
541 qg = i;
542 best_k = k;
543 best_qtheta = qtheta;
544 *itheta = j;
545 noref = 0;
546 OD_COPY(y, y_tmp, n - 1);
547 }
548 }
549 }
550 /* Don't bother with no-reference version if there's a reasonable
551 correlation. */
552 if (n <= OD_MAX_PVQ_SIZE && (corr < .5
553 || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
554 int gain_bound;
555 int prev_k;
556 gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
557 prev_k = 0;
558 /* Search for the best gain (haven't determined reasonable range yet). */
559 for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
560 double cos_dist;
561 double cost;
562 od_val32 qcg;
563 qcg = OD_SHL(i, OD_CGAIN_SHIFT);
564 k = od_pvq_compute_k(qcg, -1, 1, n, beta);
565 /* Compute the minimal possible distortion by not taking the PVQ
566 cos_dist into account. */
567 dist = gain_weight*(qcg - cg)*(qcg - cg);
568 dist *= OD_CGAIN_SCALE_2;
569 if (dist > dist0 && k != 0) continue;
570 cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
571 qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
572 prev_k = k;
573 /* See Jmspeex' Journal of Dubious Theoretical Results. */
574 dist = gain_weight*(qcg - cg)*(qcg - cg)
575 + qcg*(double)cg*(2 - 2*cos_dist);
576 dist *= OD_CGAIN_SCALE_2;
577 /* Do approximate RDO. */
578 cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
579 n, speed);
580 if (cost <= best_cost) {
581 best_cost = cost;
582 best_dist = dist;
583 qg = i;
584 noref = 1;
585 best_k = k;
586 *itheta = -1;
587 OD_COPY(y, y_tmp, n);
588 }
589 }
590 }
591 k = best_k;
592 theta = best_qtheta;
593 skip = 0;
594 if (noref) {
595 if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
596 }
597 else {
598 if (!is_keyframe && qg == 0) {
599 skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
600 }
601 if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
602 }
603 /* Synthesize like the decoder would. */
604 if (skip) {
605 if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
606 else OD_CLEAR(out, n);
607 }
608 else {
609 if (noref) gain_offset = 0;
610 g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
611 od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
612 qm_inv);
613 }
614 *vk = k;
615 *skip_diff += skip_dist - best_dist;
616 /* Encode gain differently depending on whether we use prediction or not.
617 Special encoding on inter frames where qg=0 is allowed for noref=0
618 but not noref=1.*/
619 if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
620 else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
621 }
622
623 /** Encodes a single vector of integers (eg, a partition within a
624 * coefficient block) using PVQ
625 *
626 * @param [in,out] w multi-symbol entropy encoder
627 * @param [in] qg quantized gain
628 * @param [in] theta quantized post-prediction theta
629 * @param [in] in coefficient vector to code
630 * @param [in] n number of coefficients in partition
631 * @param [in] k number of pulses in partition
632 * @param [in,out] model entropy encoder state
633 * @param [in,out] adapt adaptation context
634 * @param [in,out] exg ExQ16 expectation of gain value
635 * @param [in,out] ext ExQ16 expectation of theta value
636 * @param [in] cdf_ctx selects which cdf context to use
637 * @param [in] is_keyframe whether we're encoding a keyframe
638 * @param [in] code_skip whether the "skip rest" flag is allowed
639 * @param [in] skip_rest when set, we skip all higher bands
640 * @param [in] encode_flip whether we need to encode the CfL flip flag now
641 * @param [in] flip value of the CfL flip flag
642 */
pvq_encode_partition(aom_writer * w,int qg,int theta,const od_coeff * in,int n,int k,generic_encoder model[3],od_adapt_ctx * adapt,int * exg,int * ext,int cdf_ctx,int is_keyframe,int code_skip,int skip_rest,int encode_flip,int flip)643 void pvq_encode_partition(aom_writer *w,
644 int qg,
645 int theta,
646 const od_coeff *in,
647 int n,
648 int k,
649 generic_encoder model[3],
650 od_adapt_ctx *adapt,
651 int *exg,
652 int *ext,
653 int cdf_ctx,
654 int is_keyframe,
655 int code_skip,
656 int skip_rest,
657 int encode_flip,
658 int flip) {
659 int noref;
660 int id;
661 noref = (theta == -1);
662 id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest;
663 if (is_keyframe) {
664 OD_ASSERT(id != 8);
665 if (id >= 8) id--;
666 }
667 else {
668 OD_ASSERT(id != 10);
669 if (id >= 10) id--;
670 }
671 /* Jointly code gain, theta and noref for small values. Then we handle
672 larger gain and theta values. For noref, theta = -1. */
673 aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
674 8 + 7*code_skip);
675 if (encode_flip) {
676 /* We could eventually do some smarter entropy coding here, but it would
677 have to be good enough to overcome the overhead of the entropy coder.
678 An early attempt using a "toogle" flag with simple adaptation wasn't
679 worth the trouble. */
680 aom_write_bit(w, flip);
681 }
682 if (qg > 0) {
683 int tmp;
684 tmp = *exg;
685 generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
686 OD_IIR_DIADIC(*exg, qg << 16, 2);
687 }
688 if (theta > 1) {
689 int tmp;
690 tmp = *ext;
691 generic_encode(w, &model[2], theta - 2, &tmp, 2);
692 OD_IIR_DIADIC(*ext, theta << 16, 2);
693 }
694 aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
695 n - (theta != -1), k);
696 }
697
698 /** Quantizes a scalar with rate-distortion optimization (RDO)
699 * @param [in] x unquantized value
700 * @param [in] q quantization step size
701 * @param [in] delta0 rate increase for encoding a 1 instead of a 0
702 * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
703 * @retval quantized value
704 */
od_rdo_quant(od_coeff x,int q,double delta0,double pvq_norm_lambda)705 int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
706 int n;
707 /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
708 Jmspeex' Journal of Dubious Theoretical Results for details. */
709 n = OD_DIV_R0(abs(x), q);
710 if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
711 return 0;
712 }
713 else {
714 return OD_DIV_R0(x, q);
715 }
716 }
717
718 /** Encode a coefficient block (excepting DC) using PVQ
719 *
720 * @param [in,out] enc daala encoder context
721 * @param [in] ref 'reference' (prediction) vector
722 * @param [in] in coefficient block to quantize and encode
723 * @param [out] out quantized coefficient block
724 * @param [in] q0 scale/quantizer
725 * @param [in] pli plane index
726 * @param [in] bs log of the block size minus two
727 * @param [in] beta per-band activity masking beta param
728 * @param [in] is_keyframe whether we're encoding a keyframe
729 * @param [in] qm QM with magnitude compensation
730 * @param [in] qm_inv Inverse of QM with magnitude compensation
731 * @param [in] speed Make search faster by making approximations
732 * @param [in] pvq_info If null, conisdered as RDO search mode
733 * @return Returns block skip info indicating whether DC/AC are coded.
734 * bit0: DC is coded, bit1: AC is coded (1 means coded)
735 *
736 */
od_pvq_encode(daala_enc_ctx * enc,od_coeff * ref,const od_coeff * in,od_coeff * out,int q_dc,int q_ac,int pli,int bs,const od_val16 * beta,int is_keyframe,const int16_t * qm,const int16_t * qm_inv,int speed,PVQ_INFO * pvq_info)737 PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
738 od_coeff *ref,
739 const od_coeff *in,
740 od_coeff *out,
741 int q_dc,
742 int q_ac,
743 int pli,
744 int bs,
745 const od_val16 *beta,
746 int is_keyframe,
747 const int16_t *qm,
748 const int16_t *qm_inv,
749 int speed,
750 PVQ_INFO *pvq_info){
751 int theta[PVQ_MAX_PARTITIONS];
752 int qg[PVQ_MAX_PARTITIONS];
753 int k[PVQ_MAX_PARTITIONS];
754 od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
755 int *exg;
756 int *ext;
757 int nb_bands;
758 int i;
759 const int *off;
760 int size[PVQ_MAX_PARTITIONS];
761 generic_encoder *model;
762 double skip_diff;
763 int tell;
764 uint16_t *skip_cdf;
765 od_rollback_buffer buf;
766 int dc_quant;
767 int flip;
768 int cfl_encoded;
769 int skip_rest;
770 int skip_dir;
771 int skip_theta_value;
772 const unsigned char *pvq_qm;
773 double dc_rate;
774 int use_masking;
775 PVQ_SKIP_TYPE ac_dc_coded;
776
777 aom_clear_system_state();
778
779 use_masking = enc->use_activity_masking;
780
781 if (use_masking)
782 pvq_qm = &enc->state.pvq_qm_q4[pli][0];
783 else
784 pvq_qm = 0;
785
786 exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
787 ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
788 skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
789 model = enc->state.adapt->pvq.pvq_param_model;
790 nb_bands = OD_BAND_OFFSETS[bs][0];
791 off = &OD_BAND_OFFSETS[bs][1];
792
793 if (use_masking)
794 dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
795 else
796 dc_quant = OD_MAXI(1, q_dc);
797
798 tell = 0;
799 for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
800 skip_diff = 0;
801 flip = 0;
802 /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
803 if (pli != 0 && is_keyframe) {
804 od_val32 xy;
805 xy = 0;
806 /*Compute the dot-product of the first band of chroma with the luma ref.*/
807 for (i = off[0]; i < off[1]; i++) {
808 #if defined(OD_FLOAT_PVQ)
809 xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
810 (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
811 #else
812 od_val32 rq;
813 od_val32 inq;
814 rq = ref[i]*qm[i];
815 inq = in[i]*qm[i];
816 xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
817 1));
818 #endif
819 }
820 /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
821 if (xy < 0) {
822 flip = 1;
823 for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
824 }
825 }
826 for (i = 0; i < nb_bands; i++) {
827 int q;
828
829 if (use_masking)
830 q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
831 else
832 q = OD_MAXI(1, q_ac);
833
834 qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
835 q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
836 pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
837 enc->pvq_norm_lambda, speed);
838 }
839 od_encode_checkpoint(enc, &buf);
840 if (is_keyframe) out[0] = 0;
841 else {
842 int n;
843 n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
844 if (n == 0) {
845 out[0] = 0;
846 } else {
847 int tell2;
848 od_rollback_buffer dc_buf;
849
850 dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/
851 (double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1])));
852 dc_rate += 1;
853
854 #if !CONFIG_ANS
855 tell2 = od_ec_enc_tell_frac(&enc->w.ec);
856 #else
857 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
858 #endif
859 od_encode_checkpoint(enc, &dc_buf);
860 generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
861 n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
862 #if !CONFIG_ANS
863 tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
864 #else
865 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
866 #endif
867 dc_rate += tell2/8.0;
868 od_encode_rollback(enc, &dc_buf);
869
870 out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
871 enc->pvq_norm_lambda);
872 }
873 }
874 #if !CONFIG_ANS
875 tell = od_ec_enc_tell_frac(&enc->w.ec);
876 #else
877 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
878 #endif
879 /* Code as if we're not skipping. */
880 aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
881 ac_dc_coded = AC_CODED + (out[0] != 0);
882 cfl_encoded = 0;
883 skip_rest = 1;
884 skip_theta_value = is_keyframe ? -1 : 0;
885 for (i = 1; i < nb_bands; i++) {
886 if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
887 }
888 skip_dir = 0;
889 if (nb_bands > 1) {
890 for (i = 0; i < 3; i++) {
891 int j;
892 int tmp;
893 tmp = 1;
894 // ToDo(yaowu): figure out better stop condition without gcc warning.
895 for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
896 if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
897 }
898 skip_dir |= tmp << i;
899 }
900 }
901 if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
902
903 /* NOTE: There was no other better place to put this function. */
904 if (pvq_info)
905 av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
906 skip_rest, skip_dir, bs);
907
908 for (i = 0; i < nb_bands; i++) {
909 int encode_flip;
910 /* Encode CFL flip bit just after the first time it's used. */
911 encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
912 if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
913 pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
914 size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
915 (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
916 is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
917 }
918 if (i == 0 && !skip_rest && bs > 0) {
919 aom_write_symbol(&enc->w, skip_dir,
920 &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
921 }
922 if (encode_flip) cfl_encoded = 1;
923 }
924 #if !CONFIG_ANS
925 tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
926 #else
927 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
928 #endif
929 /* Account for the rate of skipping the AC, based on the same DC decision
930 we made when trying to not skip AC. */
931 {
932 double skip_rate;
933 if (out[0] != 0) {
934 skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
935 (double)OD_ICDF(skip_cdf[3]));
936 }
937 else {
938 skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/
939 (double)OD_ICDF(skip_cdf[3]));
940 }
941 tell -= (int)floor(.5+8*skip_rate);
942 }
943 if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
944 if (is_keyframe) out[0] = 0;
945 else {
946 int n;
947 n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
948 if (n == 0) {
949 out[0] = 0;
950 } else {
951 int tell2;
952 od_rollback_buffer dc_buf;
953
954 dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
955 (double)OD_ICDF(skip_cdf[0]));
956 dc_rate += 1;
957
958 #if !CONFIG_ANS
959 tell2 = od_ec_enc_tell_frac(&enc->w.ec);
960 #else
961 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
962 #endif
963 od_encode_checkpoint(enc, &dc_buf);
964 generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
965 n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
966 #if !CONFIG_ANS
967 tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
968 #else
969 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
970 #endif
971 dc_rate += tell2/8.0;
972 od_encode_rollback(enc, &dc_buf);
973
974 out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
975 enc->pvq_norm_lambda);
976 }
977 }
978 /* We decide to skip, roll back everything as it was before. */
979 od_encode_rollback(enc, &buf);
980 aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
981 ac_dc_coded = (out[0] != 0);
982 if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
983 else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
984 }
985 if (pvq_info)
986 pvq_info->ac_dc_coded = ac_dc_coded;
987 return ac_dc_coded;
988 }
989