1 /*
2  * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 /* clang-format off */
13 
14 #ifdef HAVE_CONFIG_H
15 # include "config.h"
16 #endif
17 
18 #include <math.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include "aom_dsp/entcode.h"
22 #include "aom_dsp/entenc.h"
23 #include "av1/common/blockd.h"
24 #include "av1/common/odintrin.h"
25 #include "av1/common/partition.h"
26 #include "av1/common/pvq_state.h"
27 #include "av1/encoder/encodemb.h"
28 #include "av1/encoder/pvq_encoder.h"
29 #include "aom_ports/system_state.h"
30 
31 /*Shift to ensure that the upper bound (i.e. for the max blocksize) of the
32    dot-product of the 1st band of chroma with the luma ref doesn't overflow.*/
33 #define OD_CFL_FLIP_SHIFT (OD_LIMIT_BSIZE_MAX + 0)
34 
aom_write_symbol_pvq(aom_writer * w,int symb,aom_cdf_prob * cdf,int nsymbs)35 void aom_write_symbol_pvq(aom_writer *w, int symb, aom_cdf_prob *cdf,
36     int nsymbs) {
37   if (cdf[0] == 0)
38     aom_cdf_init_q15_1D(cdf, nsymbs, CDF_SIZE(nsymbs));
39   aom_write_symbol(w, symb, cdf, nsymbs);
40 }
41 
aom_encode_pvq_codeword(aom_writer * w,od_pvq_codeword_ctx * adapt,const od_coeff * in,int n,int k)42 static void aom_encode_pvq_codeword(aom_writer *w, od_pvq_codeword_ctx *adapt,
43  const od_coeff *in, int n, int k) {
44   int i;
45   aom_encode_band_pvq_splits(w, adapt, in, n, k, 0);
46   for (i = 0; i < n; i++) if (in[i]) aom_write_bit(w, in[i] < 0);
47 }
48 
49 /* Computes 1/sqrt(i) using a table for small values. */
od_rsqrt_table(int i)50 static double od_rsqrt_table(int i) {
51   static double table[16] = {
52     1.000000, 0.707107, 0.577350, 0.500000,
53     0.447214, 0.408248, 0.377964, 0.353553,
54     0.333333, 0.316228, 0.301511, 0.288675,
55     0.277350, 0.267261, 0.258199, 0.250000};
56   if (i <= 16) return table[i-1];
57   else return 1./sqrt(i);
58 }
59 
60 /*Computes 1/sqrt(start+2*i+1) using a lookup table containing the results
61    where 0 <= i < table_size.*/
od_custom_rsqrt_dynamic_table(const double * table,const int table_size,const double start,const int i)62 static double od_custom_rsqrt_dynamic_table(const double* table,
63  const int table_size, const double start, const int i) {
64   if (i < table_size) return table[i];
65   else return od_rsqrt_table((int)(start + 2*i + 1));
66 }
67 
68 /*Fills tables used in od_custom_rsqrt_dynamic_table for a given start.*/
od_fill_dynamic_rsqrt_table(double * table,const int table_size,const double start)69 static void od_fill_dynamic_rsqrt_table(double *table, const int table_size,
70  const double start) {
71   int i;
72   for (i = 0; i < table_size; i++)
73     table[i] = od_rsqrt_table((int)(start + 2*i + 1));
74 }
75 
76 /** Find the codepoint on the given PSphere closest to the desired
77  * vector. Double-precision PVQ search just to make sure our tests
78  * aren't limited by numerical accuracy.
79  *
80  * @param [in]      xcoeff  input vector to quantize (x in the math doc)
81  * @param [in]      n       number of dimensions
82  * @param [in]      k       number of pulses
83  * @param [out]     ypulse  optimal codevector found (y in the math doc)
84  * @param [out]     g2      multiplier for the distortion (typically squared
85  *                          gain units)
86  * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
87  * @param [in]      prev_k  number of pulses already in ypulse that we should
88  *                          reuse for the search (or 0 for a new search)
89  * @return                  cosine distance between x and y (between 0 and 1)
90  */
pvq_search_rdo_double_c(const od_val16 * xcoeff,int n,int k,od_coeff * ypulse,double g2,double pvq_norm_lambda,int prev_k)91 double pvq_search_rdo_double_c(const od_val16 *xcoeff, int n, int k,
92  od_coeff *ypulse, double g2, double pvq_norm_lambda, int prev_k) {
93   int i, j;
94   double xy;
95   double yy;
96   /* TODO - This blows our 8kB stack space budget and should be fixed when
97    converting PVQ to fixed point. */
98   double x[MAXN];
99   double xx;
100   double lambda;
101   double norm_1;
102   int rdo_pulses;
103   double delta_rate;
104   xx = xy = yy = 0;
105   for (j = 0; j < n; j++) {
106     x[j] = fabs((float)xcoeff[j]);
107     xx += x[j]*x[j];
108   }
109   norm_1 = 1./sqrt(1e-30 + xx);
110   lambda = pvq_norm_lambda/(1e-30 + g2);
111   i = 0;
112   if (prev_k > 0 && prev_k <= k) {
113     /* We reuse pulses from a previous search so we don't have to search them
114        again. */
115     for (j = 0; j < n; j++) {
116       ypulse[j] = abs(ypulse[j]);
117       xy += x[j]*ypulse[j];
118       yy += ypulse[j]*ypulse[j];
119       i += ypulse[j];
120     }
121   }
122   else if (k > 2) {
123     double l1_norm;
124     double l1_inv;
125     l1_norm = 0;
126     for (j = 0; j < n; j++) l1_norm += x[j];
127     l1_inv = 1./OD_MAXF(l1_norm, 1e-100);
128     for (j = 0; j < n; j++) {
129       double tmp;
130       tmp = k*x[j]*l1_inv;
131       ypulse[j] = OD_MAXI(0, (int)floor(tmp));
132       xy += x[j]*ypulse[j];
133       yy += ypulse[j]*ypulse[j];
134       i += ypulse[j];
135     }
136   }
137   else OD_CLEAR(ypulse, n);
138 
139   /* Only use RDO on the last few pulses. This not only saves CPU, but using
140      RDO on all pulses actually makes the results worse for reasons I don't
141      fully understand. */
142   rdo_pulses = 1 + k/4;
143   /* Rough assumption for now, the last position costs about 3 bits more than
144      the first. */
145   delta_rate = 3./n;
146   /* Search one pulse at a time */
147   for (; i < k - rdo_pulses; i++) {
148     int pos;
149     double best_xy;
150     double best_yy;
151     pos = 0;
152     best_xy = -10;
153     best_yy = 1;
154     for (j = 0; j < n; j++) {
155       double tmp_xy;
156       double tmp_yy;
157       tmp_xy = xy + x[j];
158       tmp_yy = yy + 2*ypulse[j] + 1;
159       tmp_xy *= tmp_xy;
160       if (j == 0 || tmp_xy*best_yy > best_xy*tmp_yy) {
161         best_xy = tmp_xy;
162         best_yy = tmp_yy;
163         pos = j;
164       }
165     }
166     xy = xy + x[pos];
167     yy = yy + 2*ypulse[pos] + 1;
168     ypulse[pos]++;
169   }
170   /* Search last pulses with RDO. Distortion is D = (x-y)^2 = x^2 - 2*x*y + y^2
171      and since x^2 and y^2 are constant, we just maximize x*y, plus a
172      lambda*rate term. Note that since x and y aren't normalized here,
173      we need to divide by sqrt(x^2)*sqrt(y^2). */
174   for (; i < k; i++) {
175     double rsqrt_table[4];
176     int rsqrt_table_size = 4;
177     int pos;
178     double best_cost;
179     pos = 0;
180     best_cost = -1e5;
181     /*Fill the small rsqrt lookup table with inputs relative to yy.
182       Specifically, the table of n values is filled with
183        rsqrt(yy + 1), rsqrt(yy + 2 + 1) .. rsqrt(yy + 2*(n-1) + 1).*/
184     od_fill_dynamic_rsqrt_table(rsqrt_table, rsqrt_table_size, yy);
185     for (j = 0; j < n; j++) {
186       double tmp_xy;
187       double tmp_yy;
188       tmp_xy = xy + x[j];
189       /*Calculate rsqrt(yy + 2*ypulse[j] + 1) using an optimized method.*/
190       tmp_yy = od_custom_rsqrt_dynamic_table(rsqrt_table, rsqrt_table_size,
191        yy, ypulse[j]);
192       tmp_xy = 2*tmp_xy*norm_1*tmp_yy - lambda*j*delta_rate;
193       if (j == 0 || tmp_xy > best_cost) {
194         best_cost = tmp_xy;
195         pos = j;
196       }
197     }
198     xy = xy + x[pos];
199     yy = yy + 2*ypulse[pos] + 1;
200     ypulse[pos]++;
201   }
202   for (i = 0; i < n; i++) {
203     if (xcoeff[i] < 0) ypulse[i] = -ypulse[i];
204   }
205   return xy/(1e-100 + sqrt(xx*yy));
206 }
207 
208 /** Encodes the gain so that the return value increases with the
209  * distance |x-ref|, so that we can encode a zero when x=ref. The
210  * value x=0 is not covered because it is only allowed in the noref
211  * case.
212  *
213  * @param [in]      x      quantized gain to encode
214  * @param [in]      ref    quantized gain of the reference
215  * @return                 interleave-encoded quantized gain value
216  */
neg_interleave(int x,int ref)217 static int neg_interleave(int x, int ref) {
218   if (x < ref) return -2*(x - ref) - 1;
219   else if (x < 2*ref) return 2*(x - ref);
220   else return x-1;
221 }
222 
od_vector_is_null(const od_coeff * x,int len)223 int od_vector_is_null(const od_coeff *x, int len) {
224   int i;
225   for (i = 0; i < len; i++) if (x[i]) return 0;
226   return 1;
227 }
228 
od_pvq_rate(int qg,int icgr,int theta,int ts,const od_adapt_ctx * adapt,const od_coeff * y0,int k,int n,int speed)229 static double od_pvq_rate(int qg, int icgr, int theta, int ts,
230   const od_adapt_ctx *adapt, const od_coeff *y0, int k, int n, int speed) {
231   double rate;
232   if (k == 0) rate = 0;
233   else if (speed > 0) {
234     int i;
235     int sum;
236     double f;
237     /* Compute "center of mass" of the pulse vector. */
238     sum = 0;
239     for (i = 0; i < n - (theta != -1); i++) sum += i*abs(y0[i]);
240     f = sum/(double)(k*n);
241     /* Estimates the number of bits it will cost to encode K pulses in
242        N dimensions based on hand-tuned fit for bitrate vs K, N and
243        "center of mass". */
244     rate = (1 + .4*f)*n*OD_LOG2(1 + OD_MAXF(0, log(n*2*(1*f + .025))*k/n)) + 3;
245   }
246   else {
247     aom_writer w;
248     od_pvq_codeword_ctx cd;
249     int tell;
250 #if !CONFIG_ANS
251     od_ec_enc_init(&w.ec, 1000);
252 #else
253 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
254 #endif
255     OD_COPY(&cd, &adapt->pvq.pvq_codeword_ctx, 1);
256 #if !CONFIG_ANS
257     tell = od_ec_enc_tell_frac(&w.ec);
258 #else
259 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
260 #endif
261     aom_encode_pvq_codeword(&w, &cd, y0, n - (theta != -1), k);
262 #if !CONFIG_ANS
263     rate = (od_ec_enc_tell_frac(&w.ec)-tell)/8.;
264     od_ec_enc_clear(&w.ec);
265 #else
266 # error "CONFIG_PVQ currently requires !CONFIG_ANS."
267 #endif
268   }
269   if (qg > 0 && theta >= 0) {
270     /* Approximate cost of entropy-coding theta */
271     rate += .9*OD_LOG2(ts);
272     if (qg == icgr) rate -= .5;
273   }
274   return rate;
275 }
276 
277 #define MAX_PVQ_ITEMS (20)
278 /* This stores the information about a PVQ search candidate, so we can sort
279    based on K. */
280 typedef struct {
281   int gain;
282   int k;
283   od_val32 qtheta;
284   int theta;
285   int ts;
286   od_val32 qcg;
287 } pvq_search_item;
288 
items_compare(pvq_search_item * a,pvq_search_item * b)289 int items_compare(pvq_search_item *a, pvq_search_item *b) {
290   /* Break ties in K with gain to ensure a stable sort.
291      Otherwise, the order depends on qsort implementation. */
292   return a->k == b->k ? a->gain - b->gain : a->k - b->k;
293 }
294 
295 /** Perform PVQ quantization with prediction, trying several
296  * possible gains and angles. See draft-valin-videocodec-pvq and
297  * http://jmvalin.ca/slides/pvq.pdf for more details.
298  *
299  * @param [out]    out         coefficients after quantization
300  * @param [in]     x0          coefficients before quantization
301  * @param [in]     r0          reference, aka predicted coefficients
302  * @param [in]     n           number of dimensions
303  * @param [in]     q0          quantization step size
304  * @param [out]    y           pulse vector (i.e. selected PVQ codevector)
305  * @param [out]    itheta      angle between input and reference (-1 if noref)
306  * @param [out]    vk          total number of pulses
307  * @param [in]     beta        per-band activity masking beta param
308  * @param [out]    skip_diff   distortion cost of skipping this block
309  *                             (accumulated)
310  * @param [in]     is_keyframe whether we're encoding a keyframe
311  * @param [in]     pli         plane index
312  * @param [in]     adapt       probability adaptation context
313  * @param [in]     qm          QM with magnitude compensation
314  * @param [in]     qm_inv      Inverse of QM with magnitude compensation
315  * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
316  * @param [in]     speed       Make search faster by making approximations
317  * @return         gain        index of the quatized gain
318 */
pvq_theta(od_coeff * out,const od_coeff * x0,const od_coeff * r0,int n,int q0,od_coeff * y,int * itheta,int * vk,od_val16 beta,double * skip_diff,int is_keyframe,int pli,const od_adapt_ctx * adapt,const int16_t * qm,const int16_t * qm_inv,double pvq_norm_lambda,int speed)319 static int pvq_theta(od_coeff *out, const od_coeff *x0, const od_coeff *r0,
320     int n, int q0, od_coeff *y, int *itheta, int *vk,
321     od_val16 beta, double *skip_diff, int is_keyframe, int pli,
322     const od_adapt_ctx *adapt, const int16_t *qm, const int16_t *qm_inv,
323     double pvq_norm_lambda, int speed) {
324   od_val32 g;
325   od_val32 gr;
326   od_coeff y_tmp[MAXN + 3];
327   int i;
328   /* Number of pulses. */
329   int k;
330   /* Companded gain of x and reference, normalized to q. */
331   od_val32 cg;
332   od_val32 cgr;
333   int icgr;
334   int qg;
335   /* Best RDO cost (D + lamdba*R) so far. */
336   double best_cost;
337   double dist0;
338   /* Distortion (D) that corresponds to the best RDO cost. */
339   double best_dist;
340   double dist;
341   /* Sign of Householder reflection. */
342   int s;
343   /* Dimension on which Householder reflects. */
344   int m;
345   od_val32 theta;
346   double corr;
347   int best_k;
348   od_val32 best_qtheta;
349   od_val32 gain_offset;
350   int noref;
351   double skip_dist;
352   int cfl_enabled;
353   int skip;
354   double gain_weight;
355   od_val16 x16[MAXN];
356   od_val16 r16[MAXN];
357   int xshift;
358   int rshift;
359   /* Give more weight to gain error when calculating the total distortion. */
360   gain_weight = 1.0;
361   OD_ASSERT(n > 1);
362   corr = 0;
363 #if !defined(OD_FLOAT_PVQ)
364   /* Shift needed to make x fit in 16 bits even after rotation.
365      This shift value is not normative (it can be changed without breaking
366      the bitstream) */
367   xshift = OD_MAXI(0, od_vector_log_mag(x0, n) - 15);
368   /* Shift needed to make the reference fit in 15 bits, so that the Householder
369      vector can fit in 16 bits.
370      This shift value *is* normative, and has to match the decoder. */
371   rshift = OD_MAXI(0, od_vector_log_mag(r0, n) - 14);
372 #else
373   xshift = 0;
374   rshift = 0;
375 #endif
376   for (i = 0; i < n; i++) {
377 #if defined(OD_FLOAT_PVQ)
378     /*This is slightly different from the original float PVQ code,
379        where the qm was applied in the accumulation in od_pvq_compute_gain and
380        the vectors were od_coeffs, not od_val16 (i.e. double).*/
381     x16[i] = x0[i]*(double)qm[i]*OD_QM_SCALE_1;
382     r16[i] = r0[i]*(double)qm[i]*OD_QM_SCALE_1;
383 #else
384     x16[i] = OD_SHR_ROUND(x0[i]*qm[i], OD_QM_SHIFT + xshift);
385     r16[i] = OD_SHR_ROUND(r0[i]*qm[i], OD_QM_SHIFT + rshift);
386 #endif
387     corr += OD_MULT16_16(x16[i], r16[i]);
388   }
389   cfl_enabled = is_keyframe && pli != 0 && !OD_DISABLE_CFL;
390   cg  = od_pvq_compute_gain(x16, n, q0, &g, beta, xshift);
391   cgr = od_pvq_compute_gain(r16, n, q0, &gr, beta, rshift);
392   if (cfl_enabled) cgr = OD_CGAIN_SCALE;
393   /* gain_offset is meant to make sure one of the quantized gains has
394      exactly the same gain as the reference. */
395 #if defined(OD_FLOAT_PVQ)
396   icgr = (int)floor(.5 + cgr);
397 #else
398   icgr = OD_SHR_ROUND(cgr, OD_CGAIN_SHIFT);
399 #endif
400   gain_offset = cgr - OD_SHL(icgr, OD_CGAIN_SHIFT);
401   /* Start search with null case: gain=0, no pulse. */
402   qg = 0;
403   dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
404   best_dist = dist;
405   best_cost = dist + pvq_norm_lambda*od_pvq_rate(0, 0, -1, 0, adapt, NULL, 0,
406     n, speed);
407   noref = 1;
408   best_k = 0;
409   *itheta = -1;
410   OD_CLEAR(y, n);
411   best_qtheta = 0;
412   m = 0;
413   s = 1;
414   corr = corr/(1e-100 + g*(double)gr/OD_SHL(1, xshift + rshift));
415   corr = OD_MAXF(OD_MINF(corr, 1.), -1.);
416   if (is_keyframe) skip_dist = gain_weight*cg*cg*OD_CGAIN_SCALE_2;
417   else {
418     skip_dist = gain_weight*(cg - cgr)*(cg - cgr)
419      + cgr*(double)cg*(2 - 2*corr);
420     skip_dist *= OD_CGAIN_SCALE_2;
421   }
422   if (!is_keyframe) {
423     /* noref, gain=0 isn't allowed, but skip is allowed. */
424     od_val32 scgr;
425     scgr = OD_MAXF(0,gain_offset);
426     if (icgr == 0) {
427       best_dist = gain_weight*(cg - scgr)*(cg - scgr)
428        + scgr*(double)cg*(2 - 2*corr);
429       best_dist *= OD_CGAIN_SCALE_2;
430     }
431     best_cost = best_dist + pvq_norm_lambda*od_pvq_rate(0, icgr, 0, 0, adapt,
432      NULL, 0, n, speed);
433     best_qtheta = 0;
434     *itheta = 0;
435     noref = 0;
436   }
437   dist0 = best_dist;
438   if (n <= OD_MAX_PVQ_SIZE && !od_vector_is_null(r0, n) && corr > 0) {
439     od_val16 xr[MAXN];
440     int gain_bound;
441     int prev_k;
442     pvq_search_item items[MAX_PVQ_ITEMS];
443     int idx;
444     int nitems;
445     double cos_dist;
446     idx = 0;
447     gain_bound = OD_SHR(cg - gain_offset, OD_CGAIN_SHIFT);
448     /* Perform theta search only if prediction is useful. */
449     theta = OD_ROUND32(OD_THETA_SCALE*acos(corr));
450     m = od_compute_householder(r16, n, gr, &s, rshift);
451     od_apply_householder(xr, x16, r16, n);
452     prev_k = 0;
453     for (i = m; i < n - 1; i++) xr[i] = xr[i + 1];
454     /* Compute all candidate PVQ searches within a reasonable range of gain
455        and theta. */
456     for (i = OD_MAXI(1, gain_bound - 1); i <= gain_bound + 1; i++) {
457       int j;
458       od_val32 qcg;
459       int ts;
460       int theta_lower;
461       int theta_upper;
462       /* Quantized companded gain */
463       qcg = OD_SHL(i, OD_CGAIN_SHIFT) + gain_offset;
464       /* Set angular resolution (in ra) to match the encoded gain */
465       ts = od_pvq_compute_max_theta(qcg, beta);
466       theta_lower = OD_MAXI(0, (int)floor(.5 +
467        theta*OD_THETA_SCALE_1*2/M_PI*ts) - 2);
468       theta_upper = OD_MINI(ts - 1, (int)ceil(theta*OD_THETA_SCALE_1*2/M_PI*ts));
469       /* Include the angles within a reasonable range. */
470       for (j = theta_lower; j <= theta_upper; j++) {
471         od_val32 qtheta;
472         qtheta = od_pvq_compute_theta(j, ts);
473         k = od_pvq_compute_k(qcg, j, 0, n, beta);
474         items[idx].gain = i;
475         items[idx].theta = j;
476         items[idx].k = k;
477         items[idx].qcg = qcg;
478         items[idx].qtheta = qtheta;
479         items[idx].ts = ts;
480         idx++;
481         OD_ASSERT(idx < MAX_PVQ_ITEMS);
482       }
483     }
484     nitems = idx;
485     cos_dist = 0;
486     /* Sort PVQ search candidates in ascending order of pulses K so that
487        we can reuse all the previously searched pulses across searches. */
488     qsort(items, nitems, sizeof(items[0]),
489      (int (*)(const void *, const void *))items_compare);
490     /* Search for the best gain/theta in order. */
491     for (idx = 0; idx < nitems; idx++) {
492       int j;
493       od_val32 qcg;
494       int ts;
495       double cost;
496       double dist_theta;
497       double sin_prod;
498       od_val32 qtheta;
499       /* Quantized companded gain */
500       qcg = items[idx].qcg;
501       i = items[idx].gain;
502       j = items[idx].theta;
503       /* Set angular resolution (in ra) to match the encoded gain */
504       ts = items[idx].ts;
505       /* Search for the best angle within a reasonable range. */
506       qtheta = items[idx].qtheta;
507       k = items[idx].k;
508       /* Compute the minimal possible distortion by not taking the PVQ
509          cos_dist into account. */
510       dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1;
511       dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
512       dist *= OD_CGAIN_SCALE_2;
513       /* If we have no hope of beating skip (including a 1-bit worst-case
514          penalty), stop now. */
515       if (dist > dist0 + 1.0*pvq_norm_lambda && k != 0) continue;
516       sin_prod = od_pvq_sin(theta)*OD_TRIG_SCALE_1*od_pvq_sin(qtheta)*
517        OD_TRIG_SCALE_1;
518       /* PVQ search, using a gain of qcg*cg*sin(theta)*sin(qtheta) since
519          that's the factor by which cos_dist is multiplied to get the
520          distortion metric. */
521       if (k == 0) {
522         cos_dist = 0;
523         OD_CLEAR(y_tmp, n-1);
524       }
525       else if (k != prev_k) {
526         cos_dist = pvq_search_rdo_double(xr, n - 1, k, y_tmp,
527          qcg*(double)cg*sin_prod*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
528       }
529       prev_k = k;
530       /* See Jmspeex' Journal of Dubious Theoretical Results. */
531       dist_theta = 2 - 2.*od_pvq_cos(theta - qtheta)*OD_TRIG_SCALE_1
532        + sin_prod*(2 - 2*cos_dist);
533       dist = gain_weight*(qcg - cg)*(qcg - cg) + qcg*(double)cg*dist_theta;
534       dist *= OD_CGAIN_SCALE_2;
535       /* Do approximate RDO. */
536       cost = dist + pvq_norm_lambda*od_pvq_rate(i, icgr, j, ts, adapt, y_tmp,
537        k, n, speed);
538       if (cost < best_cost) {
539         best_cost = cost;
540         best_dist = dist;
541         qg = i;
542         best_k = k;
543         best_qtheta = qtheta;
544         *itheta = j;
545         noref = 0;
546         OD_COPY(y, y_tmp, n - 1);
547       }
548     }
549   }
550   /* Don't bother with no-reference version if there's a reasonable
551      correlation. */
552   if (n <= OD_MAX_PVQ_SIZE && (corr < .5
553         || cg < (od_val32)(OD_SHL(2, OD_CGAIN_SHIFT)))) {
554     int gain_bound;
555     int prev_k;
556     gain_bound = OD_SHR(cg, OD_CGAIN_SHIFT);
557     prev_k = 0;
558     /* Search for the best gain (haven't determined reasonable range yet). */
559     for (i = OD_MAXI(1, gain_bound); i <= gain_bound + 1; i++) {
560       double cos_dist;
561       double cost;
562       od_val32 qcg;
563       qcg = OD_SHL(i, OD_CGAIN_SHIFT);
564       k = od_pvq_compute_k(qcg, -1, 1, n, beta);
565       /* Compute the minimal possible distortion by not taking the PVQ
566          cos_dist into account. */
567       dist = gain_weight*(qcg - cg)*(qcg - cg);
568       dist *= OD_CGAIN_SCALE_2;
569       if (dist > dist0 && k != 0) continue;
570       cos_dist = pvq_search_rdo_double(x16, n, k, y_tmp,
571        qcg*(double)cg*OD_CGAIN_SCALE_2, pvq_norm_lambda, prev_k);
572       prev_k = k;
573       /* See Jmspeex' Journal of Dubious Theoretical Results. */
574       dist = gain_weight*(qcg - cg)*(qcg - cg)
575        + qcg*(double)cg*(2 - 2*cos_dist);
576       dist *= OD_CGAIN_SCALE_2;
577       /* Do approximate RDO. */
578       cost = dist + pvq_norm_lambda*od_pvq_rate(i, 0, -1, 0, adapt, y_tmp, k,
579        n, speed);
580       if (cost <= best_cost) {
581         best_cost = cost;
582         best_dist = dist;
583         qg = i;
584         noref = 1;
585         best_k = k;
586         *itheta = -1;
587         OD_COPY(y, y_tmp, n);
588       }
589     }
590   }
591   k = best_k;
592   theta = best_qtheta;
593   skip = 0;
594   if (noref) {
595     if (qg == 0) skip = OD_PVQ_SKIP_ZERO;
596   }
597   else {
598     if (!is_keyframe && qg == 0) {
599       skip = (icgr ? OD_PVQ_SKIP_ZERO : OD_PVQ_SKIP_COPY);
600     }
601     if (qg == icgr && *itheta == 0 && !cfl_enabled) skip = OD_PVQ_SKIP_COPY;
602   }
603   /* Synthesize like the decoder would. */
604   if (skip) {
605     if (skip == OD_PVQ_SKIP_COPY) OD_COPY(out, r0, n);
606     else OD_CLEAR(out, n);
607   }
608   else {
609     if (noref) gain_offset = 0;
610     g = od_gain_expand(OD_SHL(qg, OD_CGAIN_SHIFT) + gain_offset, q0, beta);
611     od_pvq_synthesis_partial(out, y, r16, n, noref, g, theta, m, s,
612      qm_inv);
613   }
614   *vk = k;
615   *skip_diff += skip_dist - best_dist;
616   /* Encode gain differently depending on whether we use prediction or not.
617      Special encoding on inter frames where qg=0 is allowed for noref=0
618      but not noref=1.*/
619   if (is_keyframe) return noref ? qg : neg_interleave(qg, icgr);
620   else return noref ? qg - 1 : neg_interleave(qg + 1, icgr + 1);
621 }
622 
623 /** Encodes a single vector of integers (eg, a partition within a
624  *  coefficient block) using PVQ
625  *
626  * @param [in,out] w          multi-symbol entropy encoder
627  * @param [in]     qg         quantized gain
628  * @param [in]     theta      quantized post-prediction theta
629  * @param [in]     in         coefficient vector to code
630  * @param [in]     n          number of coefficients in partition
631  * @param [in]     k          number of pulses in partition
632  * @param [in,out] model      entropy encoder state
633  * @param [in,out] adapt      adaptation context
634  * @param [in,out] exg        ExQ16 expectation of gain value
635  * @param [in,out] ext        ExQ16 expectation of theta value
636  * @param [in]     cdf_ctx    selects which cdf context to use
637  * @param [in]     is_keyframe whether we're encoding a keyframe
638  * @param [in]     code_skip  whether the "skip rest" flag is allowed
639  * @param [in]     skip_rest  when set, we skip all higher bands
640  * @param [in]     encode_flip whether we need to encode the CfL flip flag now
641  * @param [in]     flip       value of the CfL flip flag
642  */
pvq_encode_partition(aom_writer * w,int qg,int theta,const od_coeff * in,int n,int k,generic_encoder model[3],od_adapt_ctx * adapt,int * exg,int * ext,int cdf_ctx,int is_keyframe,int code_skip,int skip_rest,int encode_flip,int flip)643 void pvq_encode_partition(aom_writer *w,
644                                  int qg,
645                                  int theta,
646                                  const od_coeff *in,
647                                  int n,
648                                  int k,
649                                  generic_encoder model[3],
650                                  od_adapt_ctx *adapt,
651                                  int *exg,
652                                  int *ext,
653                                  int cdf_ctx,
654                                  int is_keyframe,
655                                  int code_skip,
656                                  int skip_rest,
657                                  int encode_flip,
658                                  int flip) {
659   int noref;
660   int id;
661   noref = (theta == -1);
662   id = (qg > 0) + 2*OD_MINI(theta + 1,3) + 8*code_skip*skip_rest;
663   if (is_keyframe) {
664     OD_ASSERT(id != 8);
665     if (id >= 8) id--;
666   }
667   else {
668     OD_ASSERT(id != 10);
669     if (id >= 10) id--;
670   }
671   /* Jointly code gain, theta and noref for small values. Then we handle
672      larger gain and theta values. For noref, theta = -1. */
673   aom_write_symbol_pvq(w, id, &adapt->pvq.pvq_gaintheta_cdf[cdf_ctx][0],
674    8 + 7*code_skip);
675   if (encode_flip) {
676     /* We could eventually do some smarter entropy coding here, but it would
677        have to be good enough to overcome the overhead of the entropy coder.
678        An early attempt using a "toogle" flag with simple adaptation wasn't
679        worth the trouble. */
680     aom_write_bit(w, flip);
681   }
682   if (qg > 0) {
683     int tmp;
684     tmp = *exg;
685     generic_encode(w, &model[!noref], qg - 1, &tmp, 2);
686     OD_IIR_DIADIC(*exg, qg << 16, 2);
687   }
688   if (theta > 1) {
689     int tmp;
690     tmp = *ext;
691     generic_encode(w, &model[2], theta - 2, &tmp, 2);
692     OD_IIR_DIADIC(*ext, theta << 16, 2);
693   }
694   aom_encode_pvq_codeword(w, &adapt->pvq.pvq_codeword_ctx, in,
695    n - (theta != -1), k);
696 }
697 
698 /** Quantizes a scalar with rate-distortion optimization (RDO)
699  * @param [in] x      unquantized value
700  * @param [in] q      quantization step size
701  * @param [in] delta0 rate increase for encoding a 1 instead of a 0
702  * @param [in] pvq_norm_lambda enc->pvq_norm_lambda for quantized RDO
703  * @retval quantized value
704  */
od_rdo_quant(od_coeff x,int q,double delta0,double pvq_norm_lambda)705 int od_rdo_quant(od_coeff x, int q, double delta0, double pvq_norm_lambda) {
706   int n;
707   /* Optimal quantization threshold is 1/2 + lambda*delta_rate/2. See
708      Jmspeex' Journal of Dubious Theoretical Results for details. */
709   n = OD_DIV_R0(abs(x), q);
710   if ((double)abs(x)/q < (double)n/2 + pvq_norm_lambda*delta0/(2*n)) {
711     return 0;
712   }
713   else {
714     return OD_DIV_R0(x, q);
715   }
716 }
717 
718 /** Encode a coefficient block (excepting DC) using PVQ
719  *
720  * @param [in,out] enc     daala encoder context
721  * @param [in]     ref     'reference' (prediction) vector
722  * @param [in]     in      coefficient block to quantize and encode
723  * @param [out]    out     quantized coefficient block
724  * @param [in]     q0      scale/quantizer
725  * @param [in]     pli     plane index
726  * @param [in]     bs      log of the block size minus two
727  * @param [in]     beta    per-band activity masking beta param
728  * @param [in]     is_keyframe whether we're encoding a keyframe
729  * @param [in]     qm      QM with magnitude compensation
730  * @param [in]     qm_inv  Inverse of QM with magnitude compensation
731  * @param [in]     speed   Make search faster by making approximations
732  * @param [in]     pvq_info If null, conisdered as RDO search mode
733  * @return         Returns block skip info indicating whether DC/AC are coded.
734  *                 bit0: DC is coded, bit1: AC is coded (1 means coded)
735  *
736  */
od_pvq_encode(daala_enc_ctx * enc,od_coeff * ref,const od_coeff * in,od_coeff * out,int q_dc,int q_ac,int pli,int bs,const od_val16 * beta,int is_keyframe,const int16_t * qm,const int16_t * qm_inv,int speed,PVQ_INFO * pvq_info)737 PVQ_SKIP_TYPE od_pvq_encode(daala_enc_ctx *enc,
738                    od_coeff *ref,
739                    const od_coeff *in,
740                    od_coeff *out,
741                    int q_dc,
742                    int q_ac,
743                    int pli,
744                    int bs,
745                    const od_val16 *beta,
746                    int is_keyframe,
747                    const int16_t *qm,
748                    const int16_t *qm_inv,
749                    int speed,
750                    PVQ_INFO *pvq_info){
751   int theta[PVQ_MAX_PARTITIONS];
752   int qg[PVQ_MAX_PARTITIONS];
753   int k[PVQ_MAX_PARTITIONS];
754   od_coeff y[OD_TXSIZE_MAX*OD_TXSIZE_MAX];
755   int *exg;
756   int *ext;
757   int nb_bands;
758   int i;
759   const int *off;
760   int size[PVQ_MAX_PARTITIONS];
761   generic_encoder *model;
762   double skip_diff;
763   int tell;
764   uint16_t *skip_cdf;
765   od_rollback_buffer buf;
766   int dc_quant;
767   int flip;
768   int cfl_encoded;
769   int skip_rest;
770   int skip_dir;
771   int skip_theta_value;
772   const unsigned char *pvq_qm;
773   double dc_rate;
774   int use_masking;
775   PVQ_SKIP_TYPE ac_dc_coded;
776 
777   aom_clear_system_state();
778 
779   use_masking = enc->use_activity_masking;
780 
781   if (use_masking)
782     pvq_qm = &enc->state.pvq_qm_q4[pli][0];
783   else
784     pvq_qm = 0;
785 
786   exg = &enc->state.adapt->pvq.pvq_exg[pli][bs][0];
787   ext = enc->state.adapt->pvq.pvq_ext + bs*PVQ_MAX_PARTITIONS;
788   skip_cdf = enc->state.adapt->skip_cdf[2*bs + (pli != 0)];
789   model = enc->state.adapt->pvq.pvq_param_model;
790   nb_bands = OD_BAND_OFFSETS[bs][0];
791   off = &OD_BAND_OFFSETS[bs][1];
792 
793   if (use_masking)
794     dc_quant = OD_MAXI(1, q_dc * pvq_qm[od_qm_get_index(bs, 0)] >> 4);
795   else
796     dc_quant = OD_MAXI(1, q_dc);
797 
798   tell = 0;
799   for (i = 0; i < nb_bands; i++) size[i] = off[i+1] - off[i];
800   skip_diff = 0;
801   flip = 0;
802   /*If we are coding a chroma block of a keyframe, we are doing CfL.*/
803   if (pli != 0 && is_keyframe) {
804     od_val32 xy;
805     xy = 0;
806     /*Compute the dot-product of the first band of chroma with the luma ref.*/
807     for (i = off[0]; i < off[1]; i++) {
808 #if defined(OD_FLOAT_PVQ)
809       xy += ref[i]*(double)qm[i]*OD_QM_SCALE_1*
810        (double)in[i]*(double)qm[i]*OD_QM_SCALE_1;
811 #else
812       od_val32 rq;
813       od_val32 inq;
814       rq = ref[i]*qm[i];
815       inq = in[i]*qm[i];
816       xy += OD_SHR(rq*(int64_t)inq, OD_SHL(OD_QM_SHIFT + OD_CFL_FLIP_SHIFT,
817        1));
818 #endif
819     }
820     /*If cos(theta) < 0, then |theta| > pi/2 and we should negate the ref.*/
821     if (xy < 0) {
822       flip = 1;
823       for(i = off[0]; i < off[nb_bands]; i++) ref[i] = -ref[i];
824     }
825   }
826   for (i = 0; i < nb_bands; i++) {
827     int q;
828 
829     if (use_masking)
830       q = OD_MAXI(1, q_ac * pvq_qm[od_qm_get_index(bs, i + 1)] >> 4);
831     else
832       q = OD_MAXI(1, q_ac);
833 
834     qg[i] = pvq_theta(out + off[i], in + off[i], ref + off[i], size[i],
835      q, y + off[i], &theta[i], &k[i], beta[i], &skip_diff, is_keyframe,
836      pli, enc->state.adapt, qm + off[i], qm_inv + off[i],
837      enc->pvq_norm_lambda, speed);
838   }
839   od_encode_checkpoint(enc, &buf);
840   if (is_keyframe) out[0] = 0;
841   else {
842     int n;
843     n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
844     if (n == 0) {
845       out[0] = 0;
846     } else {
847       int tell2;
848       od_rollback_buffer dc_buf;
849 
850       dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[3]) - OD_ICDF(skip_cdf[2]))/
851        (double)(OD_ICDF(skip_cdf[2]) - OD_ICDF(skip_cdf[1])));
852       dc_rate += 1;
853 
854 #if !CONFIG_ANS
855       tell2 = od_ec_enc_tell_frac(&enc->w.ec);
856 #else
857 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
858 #endif
859       od_encode_checkpoint(enc, &dc_buf);
860       generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
861        n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
862 #if !CONFIG_ANS
863       tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
864 #else
865 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
866 #endif
867       dc_rate += tell2/8.0;
868       od_encode_rollback(enc, &dc_buf);
869 
870       out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
871        enc->pvq_norm_lambda);
872     }
873   }
874 #if !CONFIG_ANS
875   tell = od_ec_enc_tell_frac(&enc->w.ec);
876 #else
877 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
878 #endif
879   /* Code as if we're not skipping. */
880   aom_write_symbol(&enc->w, 2 + (out[0] != 0), skip_cdf, 4);
881   ac_dc_coded = AC_CODED + (out[0] != 0);
882   cfl_encoded = 0;
883   skip_rest = 1;
884   skip_theta_value = is_keyframe ? -1 : 0;
885   for (i = 1; i < nb_bands; i++) {
886     if (theta[i] != skip_theta_value || qg[i]) skip_rest = 0;
887   }
888   skip_dir = 0;
889   if (nb_bands > 1) {
890     for (i = 0; i < 3; i++) {
891       int j;
892       int tmp;
893       tmp = 1;
894       // ToDo(yaowu): figure out better stop condition without gcc warning.
895       for (j = i + 1; j < nb_bands && j < PVQ_MAX_PARTITIONS; j += 3) {
896         if (theta[j] != skip_theta_value || qg[j]) tmp = 0;
897       }
898       skip_dir |= tmp << i;
899     }
900   }
901   if (theta[0] == skip_theta_value && qg[0] == 0 && skip_rest) nb_bands = 0;
902 
903   /* NOTE: There was no other better place to put this function. */
904   if (pvq_info)
905     av1_store_pvq_enc_info(pvq_info, qg, theta, k, y, nb_bands, off, size,
906       skip_rest, skip_dir, bs);
907 
908   for (i = 0; i < nb_bands; i++) {
909     int encode_flip;
910     /* Encode CFL flip bit just after the first time it's used. */
911     encode_flip = pli != 0 && is_keyframe && theta[i] != -1 && !cfl_encoded;
912     if (i == 0 || (!skip_rest && !(skip_dir & (1 << ((i - 1)%3))))) {
913       pvq_encode_partition(&enc->w, qg[i], theta[i], y + off[i],
914        size[i], k[i], model, enc->state.adapt, exg + i, ext + i,
915        (pli != 0)*OD_TXSIZES*PVQ_MAX_PARTITIONS + bs*PVQ_MAX_PARTITIONS + i,
916        is_keyframe, i == 0 && (i < nb_bands - 1), skip_rest, encode_flip, flip);
917     }
918     if (i == 0 && !skip_rest && bs > 0) {
919       aom_write_symbol(&enc->w, skip_dir,
920        &enc->state.adapt->pvq.pvq_skip_dir_cdf[(pli != 0) + 2*(bs - 1)][0], 7);
921     }
922     if (encode_flip) cfl_encoded = 1;
923   }
924 #if !CONFIG_ANS
925   tell = od_ec_enc_tell_frac(&enc->w.ec) - tell;
926 #else
927 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
928 #endif
929   /* Account for the rate of skipping the AC, based on the same DC decision
930      we made when trying to not skip AC. */
931   {
932     double skip_rate;
933     if (out[0] != 0) {
934       skip_rate = -OD_LOG2((OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
935      (double)OD_ICDF(skip_cdf[3]));
936     }
937     else {
938       skip_rate = -OD_LOG2(OD_ICDF(skip_cdf[0])/
939      (double)OD_ICDF(skip_cdf[3]));
940     }
941     tell -= (int)floor(.5+8*skip_rate);
942   }
943   if (nb_bands == 0 || skip_diff <= enc->pvq_norm_lambda/8*tell) {
944     if (is_keyframe) out[0] = 0;
945     else {
946       int n;
947       n = OD_DIV_R0(abs(in[0] - ref[0]), dc_quant);
948       if (n == 0) {
949         out[0] = 0;
950       } else {
951         int tell2;
952         od_rollback_buffer dc_buf;
953 
954         dc_rate = -OD_LOG2((double)(OD_ICDF(skip_cdf[1]) - OD_ICDF(skip_cdf[0]))/
955          (double)OD_ICDF(skip_cdf[0]));
956         dc_rate += 1;
957 
958 #if !CONFIG_ANS
959         tell2 = od_ec_enc_tell_frac(&enc->w.ec);
960 #else
961 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
962 #endif
963         od_encode_checkpoint(enc, &dc_buf);
964         generic_encode(&enc->w, &enc->state.adapt->model_dc[pli],
965          n - 1, &enc->state.adapt->ex_dc[pli][bs][0], 2);
966 #if !CONFIG_ANS
967         tell2 = od_ec_enc_tell_frac(&enc->w.ec) - tell2;
968 #else
969 #error "CONFIG_PVQ currently requires !CONFIG_ANS."
970 #endif
971         dc_rate += tell2/8.0;
972         od_encode_rollback(enc, &dc_buf);
973 
974         out[0] = od_rdo_quant(in[0] - ref[0], dc_quant, dc_rate,
975          enc->pvq_norm_lambda);
976       }
977     }
978     /* We decide to skip, roll back everything as it was before. */
979     od_encode_rollback(enc, &buf);
980     aom_write_symbol(&enc->w, out[0] != 0, skip_cdf, 4);
981     ac_dc_coded = (out[0] != 0);
982     if (is_keyframe) for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = 0;
983     else for (i = 1; i < 1 << (2*bs + 4); i++) out[i] = ref[i];
984   }
985   if (pvq_info)
986     pvq_info->ac_dc_coded = ac_dc_coded;
987   return ac_dc_coded;
988 }
989