1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14   last mod: $Id$
15 
16  ********************************************************************/
17 #include <stdlib.h>
18 #include <string.h>
19 #include "encint.h"
20 
21 
22 
oc_make_eob_token(int _run_count)23 static int oc_make_eob_token(int _run_count){
24   if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
25   else{
26     int cat;
27     cat=OC_ILOGNZ_32(_run_count)-3;
28     cat=OC_MINI(cat,3);
29     return OC_DCT_REPEAT_RUN0_TOKEN+cat;
30   }
31 }
32 
oc_make_eob_token_full(int _run_count,int * _eb)33 static int oc_make_eob_token_full(int _run_count,int *_eb){
34   if(_run_count<4){
35     *_eb=0;
36     return OC_DCT_EOB1_TOKEN+_run_count-1;
37   }
38   else{
39     int cat;
40     cat=OC_ILOGNZ_32(_run_count)-3;
41     cat=OC_MINI(cat,3);
42     *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
43     return OC_DCT_REPEAT_RUN0_TOKEN+cat;
44   }
45 }
46 
47 /*Returns the number of blocks ended by an EOB token.*/
oc_decode_eob_token(int _token,int _eb)48 static int oc_decode_eob_token(int _token,int _eb){
49   return (0x20820C41U>>_token*5&0x1F)+_eb;
50 }
51 
52 /*TODO: This is now only used during DCT tokenization, and never for runs; it
53    should be simplified.*/
oc_make_dct_token_full(int _zzi,int _zzj,int _val,int * _eb)54 static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
55   int neg;
56   int zero_run;
57   int token;
58   int eb;
59   neg=_val<0;
60   _val=abs(_val);
61   zero_run=_zzj-_zzi;
62   if(zero_run>0){
63     int adj;
64     /*Implement a minor restriction on stack 1 so that we know during DC fixups
65        that extending a dctrun token from stack 1 will never overflow.*/
66     adj=_zzi!=1;
67     if(_val<2&&zero_run<17+adj){
68       if(zero_run<6){
69         token=OC_DCT_RUN_CAT1A+zero_run-1;
70         eb=neg;
71       }
72       else if(zero_run<10){
73         token=OC_DCT_RUN_CAT1B;
74         eb=zero_run-6+(neg<<2);
75       }
76       else{
77         token=OC_DCT_RUN_CAT1C;
78         eb=zero_run-10+(neg<<3);
79       }
80     }
81     else if(_val<4&&zero_run<3+adj){
82       if(zero_run<2){
83         token=OC_DCT_RUN_CAT2A;
84         eb=_val-2+(neg<<1);
85       }
86       else{
87         token=OC_DCT_RUN_CAT2B;
88         eb=zero_run-2+(_val-2<<1)+(neg<<2);
89       }
90     }
91     else{
92       if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN;
93       else token=OC_DCT_ZRL_TOKEN;
94       eb=zero_run-1;
95     }
96   }
97   else if(_val<3){
98     token=OC_ONE_TOKEN+(_val-1<<1)+neg;
99     eb=0;
100   }
101   else if(_val<7){
102     token=OC_DCT_VAL_CAT2+_val-3;
103     eb=neg;
104   }
105   else if(_val<9){
106     token=OC_DCT_VAL_CAT3;
107     eb=_val-7+(neg<<1);
108   }
109   else if(_val<13){
110     token=OC_DCT_VAL_CAT4;
111     eb=_val-9+(neg<<2);
112   }
113   else if(_val<21){
114     token=OC_DCT_VAL_CAT5;
115     eb=_val-13+(neg<<3);
116   }
117   else if(_val<37){
118     token=OC_DCT_VAL_CAT6;
119     eb=_val-21+(neg<<4);
120   }
121   else if(_val<69){
122     token=OC_DCT_VAL_CAT7;
123     eb=_val-37+(neg<<5);
124   }
125   else{
126     token=OC_DCT_VAL_CAT8;
127     eb=_val-69+(neg<<9);
128   }
129   *_eb=eb;
130   return token;
131 }
132 
133 /*Token logging to allow a few fragments of efficient rollback.
134   Late SKIP analysis is tied up in the tokenization process, so we need to be
135    able to undo a fragment's tokens on a whim.*/
136 
137 static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
138    0,16,16,16,16,16,32,32,
139   32,32,32,32,32,32,32,48,
140   48,48,48,48,48,48,48,48,
141   48,48,48,48,64,64,64,64,
142   64,64,64,64,64,64,64,64,
143   64,64,64,64,64,64,64,64,
144   64,64,64,64,64,64,64,64
145 };
146 
oc_token_bits(oc_enc_ctx * _enc,int _huffi,int _zzi,int _token)147 static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
148   return _enc->huff_codes[_huffi+OC_ZZI_HUFF_OFFSET[_zzi]][_token].nbits
149    +OC_DCT_TOKEN_EXTRA_BITS[_token];
150 }
151 
oc_enc_tokenlog_checkpoint(oc_enc_ctx * _enc,oc_token_checkpoint * _cp,int _pli,int _zzi)152 static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
153  oc_token_checkpoint *_cp,int _pli,int _zzi){
154   _cp->pli=_pli;
155   _cp->zzi=_zzi;
156   _cp->eob_run=_enc->eob_run[_pli][_zzi];
157   _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi];
158 }
159 
oc_enc_tokenlog_rollback(oc_enc_ctx * _enc,const oc_token_checkpoint * _stack,int _n)160 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
161  const oc_token_checkpoint *_stack,int _n){
162   int i;
163   for(i=_n;i-->0;){
164     int pli;
165     int zzi;
166     pli=_stack[i].pli;
167     zzi=_stack[i].zzi;
168     _enc->eob_run[pli][zzi]=_stack[i].eob_run;
169     _enc->ndct_tokens[pli][zzi]=_stack[i].ndct_tokens;
170   }
171 }
172 
oc_enc_token_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _token,int _eb)173 static void oc_enc_token_log(oc_enc_ctx *_enc,
174  int _pli,int _zzi,int _token,int _eb){
175   ptrdiff_t ti;
176   ti=_enc->ndct_tokens[_pli][_zzi]++;
177   _enc->dct_tokens[_pli][_zzi][ti]=(unsigned char)_token;
178   _enc->extra_bits[_pli][_zzi][ti]=(ogg_uint16_t)_eb;
179 }
180 
oc_enc_eob_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _run_count)181 static void oc_enc_eob_log(oc_enc_ctx *_enc,
182  int _pli,int _zzi,int _run_count){
183   int token;
184   int eb;
185   token=oc_make_eob_token_full(_run_count,&eb);
186   oc_enc_token_log(_enc,_pli,_zzi,token,eb);
187 }
188 
189 
oc_enc_tokenize_start(oc_enc_ctx * _enc)190 void oc_enc_tokenize_start(oc_enc_ctx *_enc){
191   memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens));
192   memset(_enc->eob_run,0,sizeof(_enc->eob_run));
193   memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
194   memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
195 }
196 
197 typedef struct oc_quant_token oc_quant_token;
198 
199 /*A single node in the Viterbi trellis.
200   We maintain up to 2 of these per coefficient:
201     - A token to code if the value is zero (EOB, zero run, or combo token).
202     - A token to code if the value is not zero (DCT value token).*/
203 struct oc_quant_token{
204   unsigned char next;
205   signed char   token;
206   ogg_int16_t   eb;
207   ogg_uint32_t  cost;
208   int           bits;
209   int           qc;
210 };
211 
212 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
213    dequantizes and de-zig-zags the result.
214   The DC coefficient is not preserved; it should be restored by the caller.*/
oc_enc_tokenize_ac(oc_enc_ctx * _enc,int _pli,ptrdiff_t _fragi,ogg_int16_t * _qdct,const ogg_uint16_t * _dequant,const ogg_int16_t * _dct,int _zzi,oc_token_checkpoint ** _stack,int _acmin)215 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
216  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
217  int _zzi,oc_token_checkpoint **_stack,int _acmin){
218   oc_token_checkpoint *stack;
219   ogg_int64_t          zflags;
220   ogg_int64_t          nzflags;
221   ogg_int64_t          best_flags;
222   ogg_uint32_t         d2_accum[64];
223   oc_quant_token       tokens[64][2];
224   ogg_uint16_t        *eob_run;
225   const unsigned char *dct_fzig_zag;
226   ogg_uint32_t         cost;
227   int                  bits;
228   int                  eob;
229   int                  token;
230   int                  eb;
231   int                  next;
232   int                  huffi;
233   int                  zzi;
234   int                  ti;
235   int                  zzj;
236   int                  qc;
237   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
238   eob_run=_enc->eob_run[_pli];
239   memset(tokens[0],0,sizeof(tokens[0]));
240   best_flags=nzflags=0;
241   zflags=1;
242   d2_accum[0]=0;
243   zzj=64;
244   for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){
245     ogg_int32_t  lambda;
246     ogg_uint32_t best_cost;
247     int          best_bits=best_bits;
248     int          best_next=best_next;
249     int          best_token=best_token;
250     int          best_eb=best_eb;
251     int          best_qc=best_qc;
252     int          flush_bits;
253     ogg_uint32_t d2;
254     int          dq;
255     int          e;
256     int          c;
257     int          s;
258     int          tj;
259     lambda=_enc->lambda;
260     qc=_qdct[zzi];
261     s=-(qc<0);
262     qc=qc+s^s;
263     c=_dct[OC_FZIG_ZAG[zzi]];
264     if(qc<=1){
265       ogg_uint32_t sum_d2;
266       int          nzeros;
267       int          dc_reserve;
268       /*The hard case: try a zero run.*/
269       if(!qc){
270         /*Skip runs that are already quantized to zeros.
271           If we considered each zero coefficient in turn, we might
272            theoretically find a better way to partition long zero runs (e.g.,
273            a run of > 17 zeros followed by a 1 might be better coded as a short
274            zero run followed by a combo token, rather than the longer zero
275            token followed by a 1 value token), but zeros are so common that
276            this becomes very computationally expensive (quadratic instead of
277            linear in the number of coefficients), for a marginal gain.*/
278         while(zzi>1&&!_qdct[zzi-1])zzi--;
279         /*The distortion of coefficients originally quantized to zero is
280            treated as zero (since we'll never quantize them to anything else).*/
281         d2=0;
282       }
283       else{
284         c=c+s^s;
285         d2=c*(ogg_int32_t)c;
286       }
287       eob=eob_run[zzi];
288       nzeros=zzj-zzi;
289       zzj&=63;
290       sum_d2=d2+d2_accum[zzj];
291       d2_accum[zzi]=sum_d2;
292       flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
293       /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
294          to ensure they can be extended to include the DC coefficient if
295          necessary; this greatly simplifies stack-rewriting later on.*/
296       dc_reserve=zzi+62>>6;
297       best_cost=0xFFFFFFFF;
298       for(;;){
299         if(nzflags>>zzj&1){
300           int cat;
301           int val;
302           int val_s;
303           int zzk;
304           int tk;
305           next=tokens[zzj][1].next;
306           tk=next&1;
307           zzk=next>>1;
308           /*Try a pure zero run to this point.*/
309           cat=nzeros+55>>6;
310           token=OC_DCT_SHORT_ZRL_TOKEN+cat;
311           bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
312           d2=sum_d2-d2_accum[zzj];
313           cost=d2+lambda*bits+tokens[zzj][1].cost;
314           if(cost<=best_cost){
315             best_next=(zzj<<1)+1;
316             best_token=token;
317             best_eb=nzeros-1;
318             best_cost=cost;
319             best_bits=bits+tokens[zzj][1].bits;
320             best_qc=0;
321           }
322           if(nzeros<16+dc_reserve){
323             val=_qdct[zzj];
324             val_s=-(val<0);
325             val=val+val_s^val_s;
326             if(val<=2){
327               /*Try a +/- 1 combo token.*/
328               if(nzeros<6){
329                 token=OC_DCT_RUN_CAT1A+nzeros-1;
330                 eb=-val_s;
331               }
332               else{
333                 cat=nzeros+54>>6;
334                 token=OC_DCT_RUN_CAT1B+cat;
335                 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
336               }
337               e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
338               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
339               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
340               cost=d2+lambda*bits+tokens[zzk][tk].cost;
341               if(cost<=best_cost){
342                 best_next=next;
343                 best_token=token;
344                 best_eb=eb;
345                 best_cost=cost;
346                 best_bits=bits+tokens[zzk][tk].bits;
347                 best_qc=1+val_s^val_s;
348               }
349             }
350             if(nzeros<2+dc_reserve&&2<=val&&val<=4){
351               /*Try a +/- 2/3 combo token.*/
352               cat=nzeros>>1;
353               token=OC_DCT_RUN_CAT2A+cat;
354               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
355               val=2+((val+val_s^val_s)>2);
356               e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
357               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
358               cost=d2+lambda*bits+tokens[zzk][tk].cost;
359               if(cost<=best_cost){
360                 best_cost=cost;
361                 best_bits=bits+tokens[zzk][tk].bits;
362                 best_next=next;
363                 best_token=token;
364                 best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
365                 best_qc=val+val_s^val_s;
366               }
367             }
368           }
369           /*zzj can't be coded as a zero, so stop trying to extend the run.*/
370           if(!(zflags>>zzj&1))break;
371         }
372         /*We could try to consider _all_ potentially non-zero coefficients, but
373            if we already found a bunch of them not worth coding, it's fairly
374            unlikely they would now be worth coding from this position; skipping
375            them saves a lot of work.*/
376         zzj=(tokens[zzj][0].next>>1)-(tokens[zzj][0].qc!=0)&63;
377         if(zzj==0){
378           /*We made it all the way to the end of the block; try an EOB token.*/
379           if(eob<4095){
380             bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
381              -flush_bits;
382           }
383           else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
384           cost=sum_d2+bits*lambda;
385           /*If the best route so far is still a pure zero run to the end of the
386              block, force coding it as an EOB.
387             Even if it's not optimal for this block, it has a good chance of
388              getting combined with an EOB token from subsequent blocks, saving
389              bits overall.*/
390           if(cost<=best_cost||best_token<=OC_DCT_ZRL_TOKEN&&zzi+best_eb==63){
391             best_next=0;
392             /*This token is just a marker; in reality we may not emit any
393                tokens, but update eob_run[] instead.*/
394             best_token=OC_DCT_EOB1_TOKEN;
395             best_eb=0;
396             best_cost=cost;
397             best_bits=bits;
398             best_qc=0;
399           }
400           break;
401         }
402         nzeros=zzj-zzi;
403       }
404       tokens[zzi][0].next=(unsigned char)best_next;
405       tokens[zzi][0].token=(signed char)best_token;
406       tokens[zzi][0].eb=(ogg_int16_t)best_eb;
407       tokens[zzi][0].cost=best_cost;
408       tokens[zzi][0].bits=best_bits;
409       tokens[zzi][0].qc=best_qc;
410       zflags|=(ogg_int64_t)1<<zzi;
411       if(qc){
412         dq=_dequant[zzi];
413         if(zzi<_acmin)lambda=0;
414         e=dq-c;
415         d2=e*(ogg_int32_t)e;
416         token=OC_ONE_TOKEN-s;
417         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
418         zzj=zzi+1&63;
419         tj=best_flags>>zzj&1;
420         next=(zzj<<1)+tj;
421         tokens[zzi][1].next=(unsigned char)next;
422         tokens[zzi][1].token=(signed char)token;
423         tokens[zzi][1].eb=0;
424         tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
425         tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
426         tokens[zzi][1].qc=1+s^s;
427         nzflags|=(ogg_int64_t)1<<zzi;
428         best_flags|=
429          (ogg_int64_t)(tokens[zzi][1].cost<tokens[zzi][0].cost)<<zzi;
430       }
431     }
432     else{
433       eob=eob_run[zzi];
434       if(zzi<_acmin)lambda=0;
435       c=c+s^s;
436       dq=_dequant[zzi];
437       /*No zero run can extend past this point.*/
438       d2_accum[zzi]=0;
439       flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
440       if(qc<=2){
441         e=2*dq-c;
442         d2=e*(ogg_int32_t)e;
443         best_token=OC_TWO_TOKEN-s;
444         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
445         best_cost=d2+lambda*best_bits;
446         e-=dq;
447         d2=e*(ogg_int32_t)e;
448         token=OC_ONE_TOKEN-s;
449         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
450         cost=d2+lambda*bits;
451         if(cost<=best_cost){
452           best_token=token;
453           best_bits=bits;
454           best_cost=cost;
455           qc--;
456         }
457         best_eb=0;
458       }
459       else if(qc<=3){
460         e=3*dq-c;
461         d2=e*(ogg_int32_t)e;
462         best_token=OC_DCT_VAL_CAT2;
463         best_eb=-s;
464         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
465         best_cost=d2+lambda*best_bits;
466         e-=dq;
467         d2=e*(ogg_int32_t)e;
468         token=OC_TWO_TOKEN-s;
469         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
470         cost=d2+lambda*bits;
471         if(cost<=best_cost){
472           best_token=token;
473           best_eb=0;
474           best_bits=bits;
475           best_cost=cost;
476           qc--;
477         }
478       }
479       else if(qc<=6){
480         e=qc*dq-c;
481         d2=e*(ogg_int32_t)e;
482         best_token=OC_DCT_VAL_CAT2+qc-3;
483         best_eb=-s;
484         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
485         best_cost=d2+lambda*best_bits;
486         e-=dq;
487         d2=e*(ogg_int32_t)e;
488         token=best_token-1;
489         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
490         cost=d2+lambda*bits;
491         if(cost<=best_cost){
492           best_token=token;
493           best_bits=bits;
494           best_cost=cost;
495           qc--;
496         }
497       }
498       else if(qc<=8){
499         e=qc*dq-c;
500         d2=e*(ogg_int32_t)e;
501         best_token=OC_DCT_VAL_CAT3;
502         best_eb=(-s<<1)+qc-7;
503         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
504         best_cost=d2+lambda*best_bits;
505         e=6*dq-c;
506         d2=e*(ogg_int32_t)e;
507         token=OC_DCT_VAL_CAT2+3;
508         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
509         cost=d2+lambda*bits;
510         if(cost<=best_cost){
511           best_token=token;
512           best_eb=-s;
513           best_bits=bits;
514           best_cost=cost;
515           qc=6;
516         }
517       }
518       else if(qc<=12){
519         e=qc*dq-c;
520         d2=e*(ogg_int32_t)e;
521         best_token=OC_DCT_VAL_CAT4;
522         best_eb=(-s<<2)+qc-9;
523         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
524         best_cost=d2+lambda*best_bits;
525         e=8*dq-c;
526         d2=e*(ogg_int32_t)e;
527         token=best_token-1;
528         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
529         cost=d2+lambda*bits;
530         if(cost<=best_cost){
531           best_token=token;
532           best_eb=(-s<<1)+1;
533           best_bits=bits;
534           best_cost=cost;
535           qc=8;
536         }
537       }
538       else if(qc<=20){
539         e=qc*dq-c;
540         d2=e*(ogg_int32_t)e;
541         best_token=OC_DCT_VAL_CAT5;
542         best_eb=(-s<<3)+qc-13;
543         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
544         best_cost=d2+lambda*best_bits;
545         e=12*dq-c;
546         d2=e*(ogg_int32_t)e;
547         token=best_token-1;
548         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
549         cost=d2+lambda*bits;
550         if(cost<=best_cost){
551           best_token=token;
552           best_eb=(-s<<2)+3;
553           best_bits=bits;
554           best_cost=cost;
555           qc=12;
556         }
557       }
558       else if(qc<=36){
559         e=qc*dq-c;
560         d2=e*(ogg_int32_t)e;
561         best_token=OC_DCT_VAL_CAT6;
562         best_eb=(-s<<4)+qc-21;
563         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
564         best_cost=d2+lambda*best_bits;
565         e=20*dq-c;
566         d2=e*(ogg_int32_t)e;
567         token=best_token-1;
568         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
569         cost=d2+lambda*bits;
570         if(cost<=best_cost){
571           best_token=token;
572           best_eb=(-s<<3)+7;
573           best_bits=bits;
574           best_cost=cost;
575           qc=20;
576         }
577       }
578       else if(qc<=68){
579         e=qc*dq-c;
580         d2=e*(ogg_int32_t)e;
581         best_token=OC_DCT_VAL_CAT7;
582         best_eb=(-s<<5)+qc-37;
583         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
584         best_cost=d2+lambda*best_bits;
585         e=36*dq-c;
586         d2=e*(ogg_int32_t)e;
587         token=best_token-1;
588         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
589         cost=d2+lambda*bits;
590         if(cost<best_cost){
591           best_token=token;
592           best_eb=(-s<<4)+15;
593           best_bits=bits;
594           best_cost=cost;
595           qc=36;
596         }
597       }
598       else{
599         e=qc*dq-c;
600         d2=e*(ogg_int32_t)e;
601         best_token=OC_DCT_VAL_CAT8;
602         best_eb=(-s<<9)+qc-69;
603         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
604         best_cost=d2+lambda*best_bits;
605         e=68*dq-c;
606         d2=e*(ogg_int32_t)e;
607         token=best_token-1;
608         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
609         cost=d2+lambda*bits;
610         if(cost<best_cost){
611           best_token=token;
612           best_eb=(-s<<5)+31;
613           best_bits=bits;
614           best_cost=cost;
615           qc=68;
616         }
617       }
618       zzj=zzi+1&63;
619       tj=best_flags>>zzj&1;
620       next=(zzj<<1)+tj;
621       tokens[zzi][1].next=(unsigned char)next;
622       tokens[zzi][1].token=(signed char)best_token;
623       tokens[zzi][1].eb=best_eb;
624       tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
625       tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
626       tokens[zzi][1].qc=qc+s^s;
627       nzflags|=(ogg_int64_t)1<<zzi;
628       best_flags|=(ogg_int64_t)1<<zzi;
629     }
630     zzj=zzi;
631   }
632   /*Emit the tokens from the best path through the trellis.*/
633   stack=*_stack;
634   /*We blow away the first entry here so that things vectorize better.
635     The DC coefficient is not actually stored in the array yet.*/
636   for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
637   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
638   zzi=1;
639   ti=best_flags>>1&1;
640   bits=tokens[zzi][ti].bits;
641   do{
642     oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
643     eob=eob_run[zzi];
644     if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
645       if(++eob>=4095){
646         oc_enc_eob_log(_enc,_pli,zzi,eob);
647         eob=0;
648       }
649       eob_run[zzi]=eob;
650       /*We don't include the actual EOB cost for this block in the return value.
651         It will be paid for by the fragment that terminates the EOB run.*/
652       bits-=tokens[zzi][ti].bits;
653       zzi=_zzi;
654       break;
655     }
656     /*Emit pending EOB run if any.*/
657     if(eob>0){
658       oc_enc_eob_log(_enc,_pli,zzi,eob);
659       eob_run[zzi]=0;
660     }
661     oc_enc_token_log(_enc,_pli,zzi,tokens[zzi][ti].token,tokens[zzi][ti].eb);
662     next=tokens[zzi][ti].next;
663     qc=tokens[zzi][ti].qc;
664     zzj=(next>>1)-1&63;
665     /*TODO: It may be worth saving the dequantized coefficient in the trellis
666        above; we had to compute it to measure the error anyway.*/
667     _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
668     zzi=next>>1;
669     ti=next&1;
670   }
671   while(zzi);
672   *_stack=stack;
673   return bits;
674 }
675 
oc_enc_pred_dc_frag_rows(oc_enc_ctx * _enc,int _pli,int _fragy0,int _frag_yend)676 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
677  int _pli,int _fragy0,int _frag_yend){
678   const oc_fragment_plane *fplane;
679   const oc_fragment       *frags;
680   ogg_int16_t             *frag_dc;
681   ptrdiff_t                fragi;
682   int                     *pred_last;
683   int                      nhfrags;
684   int                      fragx;
685   int                      fragy;
686   fplane=_enc->state.fplanes+_pli;
687   frags=_enc->state.frags;
688   frag_dc=_enc->frag_dc;
689   pred_last=_enc->dc_pred_last[_pli];
690   nhfrags=fplane->nhfrags;
691   fragi=fplane->froffset+_fragy0*nhfrags;
692   for(fragy=_fragy0;fragy<_frag_yend;fragy++){
693     if(fragy==0){
694       /*For the first row, all of the cases reduce to just using the previous
695          predictor for the same reference frame.*/
696       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
697         if(frags[fragi].coded){
698           int ref;
699           ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
700           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
701           pred_last[ref]=frags[fragi].dc;
702         }
703       }
704     }
705     else{
706       const oc_fragment *u_frags;
707       int                l_ref;
708       int                ul_ref;
709       int                u_ref;
710       u_frags=frags-nhfrags;
711       l_ref=-1;
712       ul_ref=-1;
713       u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
714       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
715         int ur_ref;
716         if(fragx+1>=nhfrags)ur_ref=-1;
717         else{
718           ur_ref=u_frags[fragi+1].coded?
719            OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
720         }
721         if(frags[fragi].coded){
722           int pred;
723           int ref;
724           ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
725           /*We break out a separate case based on which of our neighbors use
726              the same reference frames.
727             This is somewhat faster than trying to make a generic case which
728              handles all of them, since it reduces lots of poorly predicted
729              jumps to one switch statement, and also lets a number of the
730              multiplications be optimized out by strength reduction.*/
731           switch((l_ref==ref)|(ul_ref==ref)<<1|
732            (u_ref==ref)<<2|(ur_ref==ref)<<3){
733             default:pred=pred_last[ref];break;
734             case  1:
735             case  3:pred=frags[fragi-1].dc;break;
736             case  2:pred=u_frags[fragi-1].dc;break;
737             case  4:
738             case  6:
739             case 12:pred=u_frags[fragi].dc;break;
740             case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
741             case  8:pred=u_frags[fragi+1].dc;break;
742             case  9:
743             case 11:
744             case 13:{
745               pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
746             }break;
747             case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
748             case 14:{
749               pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
750                +10*u_frags[fragi].dc)/16;
751             }break;
752             case  7:
753             case 15:{
754               int p0;
755               int p1;
756               int p2;
757               p0=frags[fragi-1].dc;
758               p1=u_frags[fragi-1].dc;
759               p2=u_frags[fragi].dc;
760               pred=(29*(p0+p2)-26*p1)/32;
761               if(abs(pred-p2)>128)pred=p2;
762               else if(abs(pred-p0)>128)pred=p0;
763               else if(abs(pred-p1)>128)pred=p1;
764             }break;
765           }
766           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
767           pred_last[ref]=frags[fragi].dc;
768           l_ref=ref;
769         }
770         else l_ref=-1;
771         ul_ref=u_ref;
772         u_ref=ur_ref;
773       }
774     }
775   }
776 }
777 
oc_enc_tokenize_dc_frag_list(oc_enc_ctx * _enc,int _pli,const ptrdiff_t * _coded_fragis,ptrdiff_t _ncoded_fragis,int _prev_ndct_tokens1,int _prev_eob_run1)778 void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
779  const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
780  int _prev_ndct_tokens1,int _prev_eob_run1){
781   const ogg_int16_t *frag_dc;
782   ptrdiff_t          fragii;
783   unsigned char     *dct_tokens0;
784   unsigned char     *dct_tokens1;
785   ogg_uint16_t      *extra_bits0;
786   ogg_uint16_t      *extra_bits1;
787   ptrdiff_t          ti0;
788   ptrdiff_t          ti1r;
789   ptrdiff_t          ti1w;
790   int                eob_run0;
791   int                eob_run1;
792   int                neobs1;
793   int                token;
794   int                eb;
795   int                token1=token1;
796   int                eb1=eb1;
797   /*Return immediately if there are no coded fragments; otherwise we'd flush
798      any trailing EOB run into the AC 1 list and never read it back out.*/
799   if(_ncoded_fragis<=0)return;
800   frag_dc=_enc->frag_dc;
801   dct_tokens0=_enc->dct_tokens[_pli][0];
802   dct_tokens1=_enc->dct_tokens[_pli][1];
803   extra_bits0=_enc->extra_bits[_pli][0];
804   extra_bits1=_enc->extra_bits[_pli][1];
805   ti0=_enc->ndct_tokens[_pli][0];
806   ti1w=ti1r=_prev_ndct_tokens1;
807   eob_run0=_enc->eob_run[_pli][0];
808   /*Flush any trailing EOB run for the 1st AC coefficient.
809     This is needed to allow us to track tokens to the end of the list.*/
810   eob_run1=_enc->eob_run[_pli][1];
811   if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
812   /*If there was an active EOB run at the start of the 1st AC stack, read it
813      in and decode it.*/
814   if(_prev_eob_run1>0){
815     token1=dct_tokens1[ti1r];
816     eb1=extra_bits1[ti1r];
817     ti1r++;
818     eob_run1=oc_decode_eob_token(token1,eb1);
819     /*Consume the portion of the run that came before these fragments.*/
820     neobs1=eob_run1-_prev_eob_run1;
821   }
822   else eob_run1=neobs1=0;
823   for(fragii=0;fragii<_ncoded_fragis;fragii++){
824     int val;
825     /*All tokens in the 1st AC coefficient stack are regenerated as the DC
826        coefficients are produced.
827       This can be done in-place; stack 1 cannot get larger.*/
828     if(!neobs1){
829       /*There's no active EOB run in stack 1; read the next token.*/
830       token1=dct_tokens1[ti1r];
831       eb1=extra_bits1[ti1r];
832       ti1r++;
833       if(token1<OC_NDCT_EOB_TOKEN_MAX){
834         neobs1=oc_decode_eob_token(token1,eb1);
835         /*It's an EOB run; add it to the current (inactive) one.
836           Because we may have moved entries to stack 0, we may have an
837            opportunity to merge two EOB runs in stack 1.*/
838         eob_run1+=neobs1;
839       }
840     }
841     val=frag_dc[_coded_fragis[fragii]];
842     if(val){
843       /*There was a non-zero DC value, so there's no alteration to stack 1
844          for this fragment; just code the stack 0 token.*/
845       /*Flush any pending EOB run.*/
846       if(eob_run0>0){
847         token=oc_make_eob_token_full(eob_run0,&eb);
848         dct_tokens0[ti0]=(unsigned char)token;
849         extra_bits0[ti0]=(ogg_uint16_t)eb;
850         ti0++;
851         eob_run0=0;
852       }
853       token=oc_make_dct_token_full(0,0,val,&eb);
854       dct_tokens0[ti0]=(unsigned char)token;
855       extra_bits0[ti0]=(ogg_uint16_t)eb;
856       ti0++;
857     }
858     else{
859       /*Zero DC value; that means the entry in stack 1 might need to be coded
860          from stack 0.
861         This requires a stack 1 fixup.*/
862       if(neobs1>0){
863         /*We're in the middle of an active EOB run in stack 1.
864           Move it to stack 0.*/
865         if(++eob_run0>=4095){
866           token=oc_make_eob_token_full(eob_run0,&eb);
867           dct_tokens0[ti0]=(unsigned char)token;
868           extra_bits0[ti0]=(ogg_uint16_t)eb;
869           ti0++;
870           eob_run0=0;
871         }
872         eob_run1--;
873       }
874       else{
875         /*No active EOB run in stack 1, so we can't extend one in stack 0.
876           Flush it if we've got it.*/
877         if(eob_run0>0){
878           token=oc_make_eob_token_full(eob_run0,&eb);
879           dct_tokens0[ti0]=(unsigned char)token;
880           extra_bits0[ti0]=(ogg_uint16_t)eb;
881           ti0++;
882           eob_run0=0;
883         }
884         /*Stack 1 token is one of: a pure zero run token, a single
885            coefficient token, or a zero run/coefficient combo token.
886           A zero run token is expanded and moved to token stack 0, and the
887            stack 1 entry dropped.
888           A single coefficient value may be transformed into combo token that
889            is moved to stack 0, or if it cannot be combined, it is left alone
890            and a single length-1 zero run is emitted in stack 0.
891           A combo token is extended and moved to stack 0.
892           During AC coding, we restrict the run lengths on combo tokens for
893            stack 1 to guarantee we can extend them.*/
894         switch(token1){
895           case OC_DCT_SHORT_ZRL_TOKEN:{
896             if(eb1<7){
897               dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
898               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
899               ti0++;
900               /*Don't write the AC coefficient back out.*/
901               continue;
902             }
903             /*Fall through.*/
904           }
905           case OC_DCT_ZRL_TOKEN:{
906             dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
907             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
908             ti0++;
909             /*Don't write the AC coefficient back out.*/
910           }continue;
911           case OC_ONE_TOKEN:
912           case OC_MINUS_ONE_TOKEN:{
913             dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
914             extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
915             ti0++;
916             /*Don't write the AC coefficient back out.*/
917           }continue;
918           case OC_TWO_TOKEN:
919           case OC_MINUS_TWO_TOKEN:{
920             dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
921             extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
922             ti0++;
923             /*Don't write the AC coefficient back out.*/
924           }continue;
925           case OC_DCT_VAL_CAT2:{
926             dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
927             extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
928             ti0++;
929             /*Don't write the AC coefficient back out.*/
930           }continue;
931           case OC_DCT_RUN_CAT1A:
932           case OC_DCT_RUN_CAT1A+1:
933           case OC_DCT_RUN_CAT1A+2:
934           case OC_DCT_RUN_CAT1A+3:{
935             dct_tokens0[ti0]=(unsigned char)(token1+1);
936             extra_bits0[ti0]=(ogg_uint16_t)eb1;
937             ti0++;
938             /*Don't write the AC coefficient back out.*/
939           }continue;
940           case OC_DCT_RUN_CAT1A+4:{
941             dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
942             extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
943             ti0++;
944             /*Don't write the AC coefficient back out.*/
945           }continue;
946           case OC_DCT_RUN_CAT1B:{
947             if((eb1&3)<3){
948               dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
949               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
950               ti0++;
951               /*Don't write the AC coefficient back out.*/
952               continue;
953             }
954             eb1=((eb1&4)<<1)-1;
955             /*Fall through.*/
956           }
957           case OC_DCT_RUN_CAT1C:{
958             dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
959             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
960             ti0++;
961             /*Don't write the AC coefficient back out.*/
962           }continue;
963           case OC_DCT_RUN_CAT2A:{
964             eb1=(eb1<<1)-1;
965             /*Fall through.*/
966           }
967           case OC_DCT_RUN_CAT2B:{
968             dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
969             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
970             ti0++;
971             /*Don't write the AC coefficient back out.*/
972           }continue;
973         }
974         /*We can't merge tokens, write a short zero run and keep going.*/
975         dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
976         extra_bits0[ti0]=0;
977         ti0++;
978       }
979     }
980     if(!neobs1){
981       /*Flush any (inactive) EOB run.*/
982       if(eob_run1>0){
983         token=oc_make_eob_token_full(eob_run1,&eb);
984         dct_tokens1[ti1w]=(unsigned char)token;
985         extra_bits1[ti1w]=(ogg_uint16_t)eb;
986         ti1w++;
987         eob_run1=0;
988       }
989       /*There's no active EOB run, so log the current token.*/
990       dct_tokens1[ti1w]=(unsigned char)token1;
991       extra_bits1[ti1w]=(ogg_uint16_t)eb1;
992       ti1w++;
993     }
994     else{
995       /*Otherwise consume one EOB from the current run.*/
996       neobs1--;
997       /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
998       if(eob_run1-neobs1>=4095){
999         token=oc_make_eob_token_full(4095,&eb);
1000         dct_tokens1[ti1w]=(unsigned char)token;
1001         extra_bits1[ti1w]=(ogg_uint16_t)eb;
1002         ti1w++;
1003         eob_run1-=4095;
1004       }
1005     }
1006   }
1007   /*Save the current state.*/
1008   _enc->ndct_tokens[_pli][0]=ti0;
1009   _enc->ndct_tokens[_pli][1]=ti1w;
1010   _enc->eob_run[_pli][0]=eob_run0;
1011   _enc->eob_run[_pli][1]=eob_run1;
1012 }
1013 
1014 /*Final EOB run welding.*/
oc_enc_tokenize_finish(oc_enc_ctx * _enc)1015 void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
1016   int pli;
1017   int zzi;
1018   /*Emit final EOB runs.*/
1019   for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
1020     int eob_run;
1021     eob_run=_enc->eob_run[pli][zzi];
1022     if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
1023   }
1024   /*Merge the final EOB run of one token list with the start of the next, if
1025      possible.*/
1026   for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
1027     int       old_tok1;
1028     int       old_tok2;
1029     int       old_eb1;
1030     int       old_eb2;
1031     int       new_tok;
1032     int       new_eb;
1033     int       zzj;
1034     int       plj;
1035     ptrdiff_t ti=ti;
1036     int       run_count;
1037     /*Make sure this coefficient has tokens at all.*/
1038     if(_enc->ndct_tokens[pli][zzi]<=0)continue;
1039     /*Ensure the first token is an EOB run.*/
1040     old_tok2=_enc->dct_tokens[pli][zzi][0];
1041     if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
1042     /*Search for a previous coefficient that has any tokens at all.*/
1043     old_tok1=OC_NDCT_EOB_TOKEN_MAX;
1044     for(zzj=zzi,plj=pli;zzj>=0;zzj--){
1045       while(plj-->0){
1046         ti=_enc->ndct_tokens[plj][zzj]-1;
1047         if(ti>=_enc->dct_token_offs[plj][zzj]){
1048           old_tok1=_enc->dct_tokens[plj][zzj][ti];
1049           break;
1050         }
1051       }
1052       if(plj>=0)break;
1053       plj=3;
1054     }
1055     /*Ensure its last token was an EOB run.*/
1056     if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
1057     /*Pull off the associated extra bits, if any, and decode the runs.*/
1058     old_eb1=_enc->extra_bits[plj][zzj][ti];
1059     old_eb2=_enc->extra_bits[pli][zzi][0];
1060     run_count=oc_decode_eob_token(old_tok1,old_eb1)
1061      +oc_decode_eob_token(old_tok2,old_eb2);
1062     /*We can't possibly combine these into one run.
1063       It might be possible to split them more optimally, but we'll just leave
1064        them as-is.*/
1065     if(run_count>=4096)continue;
1066     /*We CAN combine them into one run.*/
1067     new_tok=oc_make_eob_token_full(run_count,&new_eb);
1068     _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
1069     _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
1070     _enc->dct_token_offs[pli][zzi]++;
1071   }
1072 }
1073