1 /********************************************************************
2  *                                                                  *
3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
7  *                                                                  *
8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
9  * by the Xiph.Org Foundation http://www.xiph.org/                  *
10  *                                                                  *
11  ********************************************************************
12 
13   function:
14   last mod: $Id$
15 
16  ********************************************************************/
17 #include <stdlib.h>
18 #include <string.h>
19 #include "encint.h"
20 
21 
22 
oc_make_eob_token(int _run_count)23 static int oc_make_eob_token(int _run_count){
24   if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
25   else{
26     int cat;
27     cat=OC_ILOGNZ_32(_run_count)-3;
28     cat=OC_MINI(cat,3);
29     return OC_DCT_REPEAT_RUN0_TOKEN+cat;
30   }
31 }
32 
oc_make_eob_token_full(int _run_count,int * _eb)33 static int oc_make_eob_token_full(int _run_count,int *_eb){
34   if(_run_count<4){
35     *_eb=0;
36     return OC_DCT_EOB1_TOKEN+_run_count-1;
37   }
38   else{
39     int cat;
40     cat=OC_ILOGNZ_32(_run_count)-3;
41     cat=OC_MINI(cat,3);
42     *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
43     return OC_DCT_REPEAT_RUN0_TOKEN+cat;
44   }
45 }
46 
47 /*Returns the number of blocks ended by an EOB token.*/
oc_decode_eob_token(int _token,int _eb)48 static int oc_decode_eob_token(int _token,int _eb){
49   return (0x20820C41U>>_token*5&0x1F)+_eb;
50 }
51 
52 /*TODO: This is now only used during DCT tokenization, and never for runs; it
53    should be simplified.*/
oc_make_dct_token_full(int _zzi,int _zzj,int _val,int * _eb)54 static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
55   int neg;
56   int zero_run;
57   int token;
58   int eb;
59   neg=_val<0;
60   _val=abs(_val);
61   zero_run=_zzj-_zzi;
62   if(zero_run>0){
63     int adj;
64     /*Implement a minor restriction on stack 1 so that we know during DC fixups
65        that extending a dctrun token from stack 1 will never overflow.*/
66     adj=_zzi!=1;
67     if(_val<2&&zero_run<17+adj){
68       if(zero_run<6){
69         token=OC_DCT_RUN_CAT1A+zero_run-1;
70         eb=neg;
71       }
72       else if(zero_run<10){
73         token=OC_DCT_RUN_CAT1B;
74         eb=zero_run-6+(neg<<2);
75       }
76       else{
77         token=OC_DCT_RUN_CAT1C;
78         eb=zero_run-10+(neg<<3);
79       }
80     }
81     else if(_val<4&&zero_run<3+adj){
82       if(zero_run<2){
83         token=OC_DCT_RUN_CAT2A;
84         eb=_val-2+(neg<<1);
85       }
86       else{
87         token=OC_DCT_RUN_CAT2B;
88         eb=zero_run-2+(_val-2<<1)+(neg<<2);
89       }
90     }
91     else{
92       if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN;
93       else token=OC_DCT_ZRL_TOKEN;
94       eb=zero_run-1;
95     }
96   }
97   else if(_val<3){
98     token=OC_ONE_TOKEN+(_val-1<<1)+neg;
99     eb=0;
100   }
101   else if(_val<7){
102     token=OC_DCT_VAL_CAT2+_val-3;
103     eb=neg;
104   }
105   else if(_val<9){
106     token=OC_DCT_VAL_CAT3;
107     eb=_val-7+(neg<<1);
108   }
109   else if(_val<13){
110     token=OC_DCT_VAL_CAT4;
111     eb=_val-9+(neg<<2);
112   }
113   else if(_val<21){
114     token=OC_DCT_VAL_CAT5;
115     eb=_val-13+(neg<<3);
116   }
117   else if(_val<37){
118     token=OC_DCT_VAL_CAT6;
119     eb=_val-21+(neg<<4);
120   }
121   else if(_val<69){
122     token=OC_DCT_VAL_CAT7;
123     eb=_val-37+(neg<<5);
124   }
125   else{
126     token=OC_DCT_VAL_CAT8;
127     eb=_val-69+(neg<<9);
128   }
129   *_eb=eb;
130   return token;
131 }
132 
133 /*Token logging to allow a few fragments of efficient rollback.
134   Late SKIP analysis is tied up in the tokenization process, so we need to be
135    able to undo a fragment's tokens on a whim.*/
136 
137 static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
138    0,16,16,16,16,16,32,32,
139   32,32,32,32,32,32,32,48,
140   48,48,48,48,48,48,48,48,
141   48,48,48,48,64,64,64,64,
142   64,64,64,64,64,64,64,64,
143   64,64,64,64,64,64,64,64,
144   64,64,64,64,64,64,64,64
145 };
146 
oc_token_bits(oc_enc_ctx * _enc,int _huffi,int _zzi,int _token)147 static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
148   return _enc->huff_codes[_huffi+OC_ZZI_HUFF_OFFSET[_zzi]][_token].nbits
149    +OC_DCT_TOKEN_EXTRA_BITS[_token];
150 }
151 
oc_enc_tokenlog_checkpoint(oc_enc_ctx * _enc,oc_token_checkpoint * _cp,int _pli,int _zzi)152 static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
153  oc_token_checkpoint *_cp,int _pli,int _zzi){
154   _cp->pli=_pli;
155   _cp->zzi=_zzi;
156   _cp->eob_run=_enc->eob_run[_pli][_zzi];
157   _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi];
158 }
159 
oc_enc_tokenlog_rollback(oc_enc_ctx * _enc,const oc_token_checkpoint * _stack,int _n)160 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
161  const oc_token_checkpoint *_stack,int _n){
162   int i;
163   for(i=_n;i-->0;){
164     int pli;
165     int zzi;
166     pli=_stack[i].pli;
167     zzi=_stack[i].zzi;
168     _enc->eob_run[pli][zzi]=_stack[i].eob_run;
169     _enc->ndct_tokens[pli][zzi]=_stack[i].ndct_tokens;
170   }
171 }
172 
oc_enc_token_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _token,int _eb)173 static void oc_enc_token_log(oc_enc_ctx *_enc,
174  int _pli,int _zzi,int _token,int _eb){
175   ptrdiff_t ti;
176   ti=_enc->ndct_tokens[_pli][_zzi]++;
177   _enc->dct_tokens[_pli][_zzi][ti]=(unsigned char)_token;
178   _enc->extra_bits[_pli][_zzi][ti]=(ogg_uint16_t)_eb;
179 }
180 
oc_enc_eob_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _run_count)181 static void oc_enc_eob_log(oc_enc_ctx *_enc,
182  int _pli,int _zzi,int _run_count){
183   int token;
184   int eb;
185   token=oc_make_eob_token_full(_run_count,&eb);
186   oc_enc_token_log(_enc,_pli,_zzi,token,eb);
187 }
188 
189 
oc_enc_tokenize_start(oc_enc_ctx * _enc)190 void oc_enc_tokenize_start(oc_enc_ctx *_enc){
191   memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens));
192   memset(_enc->eob_run,0,sizeof(_enc->eob_run));
193   memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
194   memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
195 }
196 
197 typedef struct oc_quant_token oc_quant_token;
198 
199 /*A single node in the Viterbi trellis.
200   We maintain up to 2 of these per coefficient:
201     - A token to code if the value is zero (EOB, zero run, or combo token).
202     - A token to code if the value is not zero (DCT value token).*/
203 struct oc_quant_token{
204   unsigned char next;
205   signed char   token;
206   ogg_int16_t   eb;
207   ogg_uint32_t  cost;
208   int           bits;
209   int           qc;
210 };
211 
212 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
213    dequantizes and de-zig-zags the result.
214   The DC coefficient is not preserved; it should be restored by the caller.*/
oc_enc_tokenize_ac(oc_enc_ctx * _enc,int _pli,ptrdiff_t _fragi,ogg_int16_t * _qdct,const ogg_uint16_t * _dequant,const ogg_int16_t * _dct,int _zzi,oc_token_checkpoint ** _stack,int _acmin)215 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
216  ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
217  int _zzi,oc_token_checkpoint **_stack,int _acmin){
218   oc_token_checkpoint *stack;
219   ogg_int64_t          zflags;
220   ogg_int64_t          nzflags;
221   ogg_int64_t          best_flags;
222   ogg_uint32_t         d2_accum[64];
223   oc_quant_token       tokens[64][2];
224   ogg_uint16_t        *eob_run;
225   const unsigned char *dct_fzig_zag;
226   ogg_uint32_t         cost;
227   int                  bits;
228   int                  eob;
229   int                  token;
230   int                  eb;
231   int                  next;
232   int                  huffi;
233   int                  zzi;
234   int                  ti;
235   int                  zzj;
236   int                  qc;
237   (void)_fragi;
238   huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
239   eob_run=_enc->eob_run[_pli];
240   memset(tokens[0],0,sizeof(tokens[0]));
241   best_flags=nzflags=0;
242   zflags=1;
243   d2_accum[0]=0;
244   zzj=64;
245   for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){
246     ogg_int32_t  lambda;
247     ogg_uint32_t best_cost;
248     int          best_bits=best_bits;
249     int          best_next=best_next;
250     int          best_token=best_token;
251     int          best_eb=best_eb;
252     int          best_qc=best_qc;
253     int          flush_bits;
254     ogg_uint32_t d2;
255     int          dq;
256     int          e;
257     int          c;
258     int          s;
259     int          tj;
260     lambda=_enc->lambda;
261     qc=_qdct[zzi];
262     s=-(qc<0);
263     qc=qc+s^s;
264     c=_dct[OC_FZIG_ZAG[zzi]];
265     if(qc<=1){
266       ogg_uint32_t sum_d2;
267       int          nzeros;
268       int          dc_reserve;
269       /*The hard case: try a zero run.*/
270       if(!qc){
271         /*Skip runs that are already quantized to zeros.
272           If we considered each zero coefficient in turn, we might
273            theoretically find a better way to partition long zero runs (e.g.,
274            a run of > 17 zeros followed by a 1 might be better coded as a short
275            zero run followed by a combo token, rather than the longer zero
276            token followed by a 1 value token), but zeros are so common that
277            this becomes very computationally expensive (quadratic instead of
278            linear in the number of coefficients), for a marginal gain.*/
279         while(zzi>1&&!_qdct[zzi-1])zzi--;
280         /*The distortion of coefficients originally quantized to zero is
281            treated as zero (since we'll never quantize them to anything else).*/
282         d2=0;
283       }
284       else{
285         c=c+s^s;
286         d2=c*(ogg_int32_t)c;
287       }
288       eob=eob_run[zzi];
289       nzeros=zzj-zzi;
290       zzj&=63;
291       sum_d2=d2+d2_accum[zzj];
292       d2_accum[zzi]=sum_d2;
293       flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
294       /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
295          to ensure they can be extended to include the DC coefficient if
296          necessary; this greatly simplifies stack-rewriting later on.*/
297       dc_reserve=zzi+62>>6;
298       best_cost=0xFFFFFFFF;
299       for(;;){
300         if(nzflags>>zzj&1){
301           int cat;
302           int val;
303           int val_s;
304           int zzk;
305           int tk;
306           next=tokens[zzj][1].next;
307           tk=next&1;
308           zzk=next>>1;
309           /*Try a pure zero run to this point.*/
310           cat=nzeros+55>>6;
311           token=OC_DCT_SHORT_ZRL_TOKEN+cat;
312           bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
313           d2=sum_d2-d2_accum[zzj];
314           cost=d2+lambda*bits+tokens[zzj][1].cost;
315           if(cost<=best_cost){
316             best_next=(zzj<<1)+1;
317             best_token=token;
318             best_eb=nzeros-1;
319             best_cost=cost;
320             best_bits=bits+tokens[zzj][1].bits;
321             best_qc=0;
322           }
323           if(nzeros<16+dc_reserve){
324             val=_qdct[zzj];
325             val_s=-(val<0);
326             val=val+val_s^val_s;
327             if(val<=2){
328               /*Try a +/- 1 combo token.*/
329               if(nzeros<6){
330                 token=OC_DCT_RUN_CAT1A+nzeros-1;
331                 eb=-val_s;
332               }
333               else{
334                 cat=nzeros+54>>6;
335                 token=OC_DCT_RUN_CAT1B+cat;
336                 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
337               }
338               e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
339               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
340               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
341               cost=d2+lambda*bits+tokens[zzk][tk].cost;
342               if(cost<=best_cost){
343                 best_next=next;
344                 best_token=token;
345                 best_eb=eb;
346                 best_cost=cost;
347                 best_bits=bits+tokens[zzk][tk].bits;
348                 best_qc=1+val_s^val_s;
349               }
350             }
351             if(nzeros<2+dc_reserve&&2<=val&&val<=4){
352               /*Try a +/- 2/3 combo token.*/
353               cat=nzeros>>1;
354               token=OC_DCT_RUN_CAT2A+cat;
355               bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
356               val=2+((val+val_s^val_s)>2);
357               e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
358               d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
359               cost=d2+lambda*bits+tokens[zzk][tk].cost;
360               if(cost<=best_cost){
361                 best_cost=cost;
362                 best_bits=bits+tokens[zzk][tk].bits;
363                 best_next=next;
364                 best_token=token;
365                 best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
366                 best_qc=val+val_s^val_s;
367               }
368             }
369           }
370           /*zzj can't be coded as a zero, so stop trying to extend the run.*/
371           if(!(zflags>>zzj&1))break;
372         }
373         /*We could try to consider _all_ potentially non-zero coefficients, but
374            if we already found a bunch of them not worth coding, it's fairly
375            unlikely they would now be worth coding from this position; skipping
376            them saves a lot of work.*/
377         zzj=(tokens[zzj][0].next>>1)-(tokens[zzj][0].qc!=0)&63;
378         if(zzj==0){
379           /*We made it all the way to the end of the block; try an EOB token.*/
380           if(eob<4095){
381             bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
382              -flush_bits;
383           }
384           else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
385           cost=sum_d2+bits*lambda;
386           /*If the best route so far is still a pure zero run to the end of the
387              block, force coding it as an EOB.
388             Even if it's not optimal for this block, it has a good chance of
389              getting combined with an EOB token from subsequent blocks, saving
390              bits overall.*/
391           if(cost<=best_cost||best_token<=OC_DCT_ZRL_TOKEN&&zzi+best_eb==63){
392             best_next=0;
393             /*This token is just a marker; in reality we may not emit any
394                tokens, but update eob_run[] instead.*/
395             best_token=OC_DCT_EOB1_TOKEN;
396             best_eb=0;
397             best_cost=cost;
398             best_bits=bits;
399             best_qc=0;
400           }
401           break;
402         }
403         nzeros=zzj-zzi;
404       }
405       tokens[zzi][0].next=(unsigned char)best_next;
406       tokens[zzi][0].token=(signed char)best_token;
407       tokens[zzi][0].eb=(ogg_int16_t)best_eb;
408       tokens[zzi][0].cost=best_cost;
409       tokens[zzi][0].bits=best_bits;
410       tokens[zzi][0].qc=best_qc;
411       zflags|=(ogg_int64_t)1<<zzi;
412       if(qc){
413         dq=_dequant[zzi];
414         if(zzi<_acmin)lambda=0;
415         e=dq-c;
416         d2=e*(ogg_int32_t)e;
417         token=OC_ONE_TOKEN-s;
418         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
419         zzj=zzi+1&63;
420         tj=best_flags>>zzj&1;
421         next=(zzj<<1)+tj;
422         tokens[zzi][1].next=(unsigned char)next;
423         tokens[zzi][1].token=(signed char)token;
424         tokens[zzi][1].eb=0;
425         tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
426         tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
427         tokens[zzi][1].qc=1+s^s;
428         nzflags|=(ogg_int64_t)1<<zzi;
429         best_flags|=
430          (ogg_int64_t)(tokens[zzi][1].cost<tokens[zzi][0].cost)<<zzi;
431       }
432     }
433     else{
434       eob=eob_run[zzi];
435       if(zzi<_acmin)lambda=0;
436       c=c+s^s;
437       dq=_dequant[zzi];
438       /*No zero run can extend past this point.*/
439       d2_accum[zzi]=0;
440       flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
441       if(qc<=2){
442         e=2*dq-c;
443         d2=e*(ogg_int32_t)e;
444         best_token=OC_TWO_TOKEN-s;
445         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
446         best_cost=d2+lambda*best_bits;
447         e-=dq;
448         d2=e*(ogg_int32_t)e;
449         token=OC_ONE_TOKEN-s;
450         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
451         cost=d2+lambda*bits;
452         if(cost<=best_cost){
453           best_token=token;
454           best_bits=bits;
455           best_cost=cost;
456           qc--;
457         }
458         best_eb=0;
459       }
460       else if(qc<=3){
461         e=3*dq-c;
462         d2=e*(ogg_int32_t)e;
463         best_token=OC_DCT_VAL_CAT2;
464         best_eb=-s;
465         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
466         best_cost=d2+lambda*best_bits;
467         e-=dq;
468         d2=e*(ogg_int32_t)e;
469         token=OC_TWO_TOKEN-s;
470         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
471         cost=d2+lambda*bits;
472         if(cost<=best_cost){
473           best_token=token;
474           best_eb=0;
475           best_bits=bits;
476           best_cost=cost;
477           qc--;
478         }
479       }
480       else if(qc<=6){
481         e=qc*dq-c;
482         d2=e*(ogg_int32_t)e;
483         best_token=OC_DCT_VAL_CAT2+qc-3;
484         best_eb=-s;
485         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
486         best_cost=d2+lambda*best_bits;
487         e-=dq;
488         d2=e*(ogg_int32_t)e;
489         token=best_token-1;
490         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
491         cost=d2+lambda*bits;
492         if(cost<=best_cost){
493           best_token=token;
494           best_bits=bits;
495           best_cost=cost;
496           qc--;
497         }
498       }
499       else if(qc<=8){
500         e=qc*dq-c;
501         d2=e*(ogg_int32_t)e;
502         best_token=OC_DCT_VAL_CAT3;
503         best_eb=(-s<<1)+qc-7;
504         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
505         best_cost=d2+lambda*best_bits;
506         e=6*dq-c;
507         d2=e*(ogg_int32_t)e;
508         token=OC_DCT_VAL_CAT2+3;
509         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
510         cost=d2+lambda*bits;
511         if(cost<=best_cost){
512           best_token=token;
513           best_eb=-s;
514           best_bits=bits;
515           best_cost=cost;
516           qc=6;
517         }
518       }
519       else if(qc<=12){
520         e=qc*dq-c;
521         d2=e*(ogg_int32_t)e;
522         best_token=OC_DCT_VAL_CAT4;
523         best_eb=(-s<<2)+qc-9;
524         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
525         best_cost=d2+lambda*best_bits;
526         e=8*dq-c;
527         d2=e*(ogg_int32_t)e;
528         token=best_token-1;
529         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
530         cost=d2+lambda*bits;
531         if(cost<=best_cost){
532           best_token=token;
533           best_eb=(-s<<1)+1;
534           best_bits=bits;
535           best_cost=cost;
536           qc=8;
537         }
538       }
539       else if(qc<=20){
540         e=qc*dq-c;
541         d2=e*(ogg_int32_t)e;
542         best_token=OC_DCT_VAL_CAT5;
543         best_eb=(-s<<3)+qc-13;
544         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
545         best_cost=d2+lambda*best_bits;
546         e=12*dq-c;
547         d2=e*(ogg_int32_t)e;
548         token=best_token-1;
549         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
550         cost=d2+lambda*bits;
551         if(cost<=best_cost){
552           best_token=token;
553           best_eb=(-s<<2)+3;
554           best_bits=bits;
555           best_cost=cost;
556           qc=12;
557         }
558       }
559       else if(qc<=36){
560         e=qc*dq-c;
561         d2=e*(ogg_int32_t)e;
562         best_token=OC_DCT_VAL_CAT6;
563         best_eb=(-s<<4)+qc-21;
564         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
565         best_cost=d2+lambda*best_bits;
566         e=20*dq-c;
567         d2=e*(ogg_int32_t)e;
568         token=best_token-1;
569         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
570         cost=d2+lambda*bits;
571         if(cost<=best_cost){
572           best_token=token;
573           best_eb=(-s<<3)+7;
574           best_bits=bits;
575           best_cost=cost;
576           qc=20;
577         }
578       }
579       else if(qc<=68){
580         e=qc*dq-c;
581         d2=e*(ogg_int32_t)e;
582         best_token=OC_DCT_VAL_CAT7;
583         best_eb=(-s<<5)+qc-37;
584         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
585         best_cost=d2+lambda*best_bits;
586         e=36*dq-c;
587         d2=e*(ogg_int32_t)e;
588         token=best_token-1;
589         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
590         cost=d2+lambda*bits;
591         if(cost<best_cost){
592           best_token=token;
593           best_eb=(-s<<4)+15;
594           best_bits=bits;
595           best_cost=cost;
596           qc=36;
597         }
598       }
599       else{
600         e=qc*dq-c;
601         d2=e*(ogg_int32_t)e;
602         best_token=OC_DCT_VAL_CAT8;
603         best_eb=(-s<<9)+qc-69;
604         best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
605         best_cost=d2+lambda*best_bits;
606         e=68*dq-c;
607         d2=e*(ogg_int32_t)e;
608         token=best_token-1;
609         bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
610         cost=d2+lambda*bits;
611         if(cost<best_cost){
612           best_token=token;
613           best_eb=(-s<<5)+31;
614           best_bits=bits;
615           best_cost=cost;
616           qc=68;
617         }
618       }
619       zzj=zzi+1&63;
620       tj=best_flags>>zzj&1;
621       next=(zzj<<1)+tj;
622       tokens[zzi][1].next=(unsigned char)next;
623       tokens[zzi][1].token=(signed char)best_token;
624       tokens[zzi][1].eb=best_eb;
625       tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
626       tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
627       tokens[zzi][1].qc=qc+s^s;
628       nzflags|=(ogg_int64_t)1<<zzi;
629       best_flags|=(ogg_int64_t)1<<zzi;
630     }
631     zzj=zzi;
632   }
633   /*Emit the tokens from the best path through the trellis.*/
634   stack=*_stack;
635   /*We blow away the first entry here so that things vectorize better.
636     The DC coefficient is not actually stored in the array yet.*/
637   for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
638   dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
639   zzi=1;
640   ti=best_flags>>1&1;
641   bits=tokens[zzi][ti].bits;
642   do{
643     oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
644     eob=eob_run[zzi];
645     if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
646       if(++eob>=4095){
647         oc_enc_eob_log(_enc,_pli,zzi,eob);
648         eob=0;
649       }
650       eob_run[zzi]=eob;
651       /*We don't include the actual EOB cost for this block in the return value.
652         It will be paid for by the fragment that terminates the EOB run.*/
653       bits-=tokens[zzi][ti].bits;
654       zzi=_zzi;
655       break;
656     }
657     /*Emit pending EOB run if any.*/
658     if(eob>0){
659       oc_enc_eob_log(_enc,_pli,zzi,eob);
660       eob_run[zzi]=0;
661     }
662     oc_enc_token_log(_enc,_pli,zzi,tokens[zzi][ti].token,tokens[zzi][ti].eb);
663     next=tokens[zzi][ti].next;
664     qc=tokens[zzi][ti].qc;
665     zzj=(next>>1)-1&63;
666     /*TODO: It may be worth saving the dequantized coefficient in the trellis
667        above; we had to compute it to measure the error anyway.*/
668     _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
669     zzi=next>>1;
670     ti=next&1;
671   }
672   while(zzi);
673   *_stack=stack;
674   return bits;
675 }
676 
oc_enc_pred_dc_frag_rows(oc_enc_ctx * _enc,int _pli,int _fragy0,int _frag_yend)677 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
678  int _pli,int _fragy0,int _frag_yend){
679   const oc_fragment_plane *fplane;
680   const oc_fragment       *frags;
681   ogg_int16_t             *frag_dc;
682   ptrdiff_t                fragi;
683   int                     *pred_last;
684   int                      nhfrags;
685   int                      fragx;
686   int                      fragy;
687   fplane=_enc->state.fplanes+_pli;
688   frags=_enc->state.frags;
689   frag_dc=_enc->frag_dc;
690   pred_last=_enc->dc_pred_last[_pli];
691   nhfrags=fplane->nhfrags;
692   fragi=fplane->froffset+_fragy0*nhfrags;
693   for(fragy=_fragy0;fragy<_frag_yend;fragy++){
694     if(fragy==0){
695       /*For the first row, all of the cases reduce to just using the previous
696          predictor for the same reference frame.*/
697       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
698         if(frags[fragi].coded){
699           int ref;
700           ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
701           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
702           pred_last[ref]=frags[fragi].dc;
703         }
704       }
705     }
706     else{
707       const oc_fragment *u_frags;
708       int                l_ref;
709       int                ul_ref;
710       int                u_ref;
711       u_frags=frags-nhfrags;
712       l_ref=-1;
713       ul_ref=-1;
714       u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
715       for(fragx=0;fragx<nhfrags;fragx++,fragi++){
716         int ur_ref;
717         if(fragx+1>=nhfrags)ur_ref=-1;
718         else{
719           ur_ref=u_frags[fragi+1].coded?
720            OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
721         }
722         if(frags[fragi].coded){
723           int pred;
724           int ref;
725           ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
726           /*We break out a separate case based on which of our neighbors use
727              the same reference frames.
728             This is somewhat faster than trying to make a generic case which
729              handles all of them, since it reduces lots of poorly predicted
730              jumps to one switch statement, and also lets a number of the
731              multiplications be optimized out by strength reduction.*/
732           switch((l_ref==ref)|(ul_ref==ref)<<1|
733            (u_ref==ref)<<2|(ur_ref==ref)<<3){
734             default:pred=pred_last[ref];break;
735             case  1:
736             case  3:pred=frags[fragi-1].dc;break;
737             case  2:pred=u_frags[fragi-1].dc;break;
738             case  4:
739             case  6:
740             case 12:pred=u_frags[fragi].dc;break;
741             case  5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
742             case  8:pred=u_frags[fragi+1].dc;break;
743             case  9:
744             case 11:
745             case 13:{
746               pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
747             }break;
748             case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
749             case 14:{
750               pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
751                +10*u_frags[fragi].dc)/16;
752             }break;
753             case  7:
754             case 15:{
755               int p0;
756               int p1;
757               int p2;
758               p0=frags[fragi-1].dc;
759               p1=u_frags[fragi-1].dc;
760               p2=u_frags[fragi].dc;
761               pred=(29*(p0+p2)-26*p1)/32;
762               if(abs(pred-p2)>128)pred=p2;
763               else if(abs(pred-p0)>128)pred=p0;
764               else if(abs(pred-p1)>128)pred=p1;
765             }break;
766           }
767           frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
768           pred_last[ref]=frags[fragi].dc;
769           l_ref=ref;
770         }
771         else l_ref=-1;
772         ul_ref=u_ref;
773         u_ref=ur_ref;
774       }
775     }
776   }
777 }
778 
oc_enc_tokenize_dc_frag_list(oc_enc_ctx * _enc,int _pli,const ptrdiff_t * _coded_fragis,ptrdiff_t _ncoded_fragis,int _prev_ndct_tokens1,int _prev_eob_run1)779 void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
780  const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
781  int _prev_ndct_tokens1,int _prev_eob_run1){
782   const ogg_int16_t *frag_dc;
783   ptrdiff_t          fragii;
784   unsigned char     *dct_tokens0;
785   unsigned char     *dct_tokens1;
786   ogg_uint16_t      *extra_bits0;
787   ogg_uint16_t      *extra_bits1;
788   ptrdiff_t          ti0;
789   ptrdiff_t          ti1r;
790   ptrdiff_t          ti1w;
791   int                eob_run0;
792   int                eob_run1;
793   int                neobs1;
794   int                token;
795   int                eb;
796   int                token1=token1;
797   int                eb1=eb1;
798   /*Return immediately if there are no coded fragments; otherwise we'd flush
799      any trailing EOB run into the AC 1 list and never read it back out.*/
800   if(_ncoded_fragis<=0)return;
801   frag_dc=_enc->frag_dc;
802   dct_tokens0=_enc->dct_tokens[_pli][0];
803   dct_tokens1=_enc->dct_tokens[_pli][1];
804   extra_bits0=_enc->extra_bits[_pli][0];
805   extra_bits1=_enc->extra_bits[_pli][1];
806   ti0=_enc->ndct_tokens[_pli][0];
807   ti1w=ti1r=_prev_ndct_tokens1;
808   eob_run0=_enc->eob_run[_pli][0];
809   /*Flush any trailing EOB run for the 1st AC coefficient.
810     This is needed to allow us to track tokens to the end of the list.*/
811   eob_run1=_enc->eob_run[_pli][1];
812   if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
813   /*If there was an active EOB run at the start of the 1st AC stack, read it
814      in and decode it.*/
815   if(_prev_eob_run1>0){
816     token1=dct_tokens1[ti1r];
817     eb1=extra_bits1[ti1r];
818     ti1r++;
819     eob_run1=oc_decode_eob_token(token1,eb1);
820     /*Consume the portion of the run that came before these fragments.*/
821     neobs1=eob_run1-_prev_eob_run1;
822   }
823   else eob_run1=neobs1=0;
824   for(fragii=0;fragii<_ncoded_fragis;fragii++){
825     int val;
826     /*All tokens in the 1st AC coefficient stack are regenerated as the DC
827        coefficients are produced.
828       This can be done in-place; stack 1 cannot get larger.*/
829     if(!neobs1){
830       /*There's no active EOB run in stack 1; read the next token.*/
831       token1=dct_tokens1[ti1r];
832       eb1=extra_bits1[ti1r];
833       ti1r++;
834       if(token1<OC_NDCT_EOB_TOKEN_MAX){
835         neobs1=oc_decode_eob_token(token1,eb1);
836         /*It's an EOB run; add it to the current (inactive) one.
837           Because we may have moved entries to stack 0, we may have an
838            opportunity to merge two EOB runs in stack 1.*/
839         eob_run1+=neobs1;
840       }
841     }
842     val=frag_dc[_coded_fragis[fragii]];
843     if(val){
844       /*There was a non-zero DC value, so there's no alteration to stack 1
845          for this fragment; just code the stack 0 token.*/
846       /*Flush any pending EOB run.*/
847       if(eob_run0>0){
848         token=oc_make_eob_token_full(eob_run0,&eb);
849         dct_tokens0[ti0]=(unsigned char)token;
850         extra_bits0[ti0]=(ogg_uint16_t)eb;
851         ti0++;
852         eob_run0=0;
853       }
854       token=oc_make_dct_token_full(0,0,val,&eb);
855       dct_tokens0[ti0]=(unsigned char)token;
856       extra_bits0[ti0]=(ogg_uint16_t)eb;
857       ti0++;
858     }
859     else{
860       /*Zero DC value; that means the entry in stack 1 might need to be coded
861          from stack 0.
862         This requires a stack 1 fixup.*/
863       if(neobs1>0){
864         /*We're in the middle of an active EOB run in stack 1.
865           Move it to stack 0.*/
866         if(++eob_run0>=4095){
867           token=oc_make_eob_token_full(eob_run0,&eb);
868           dct_tokens0[ti0]=(unsigned char)token;
869           extra_bits0[ti0]=(ogg_uint16_t)eb;
870           ti0++;
871           eob_run0=0;
872         }
873         eob_run1--;
874       }
875       else{
876         /*No active EOB run in stack 1, so we can't extend one in stack 0.
877           Flush it if we've got it.*/
878         if(eob_run0>0){
879           token=oc_make_eob_token_full(eob_run0,&eb);
880           dct_tokens0[ti0]=(unsigned char)token;
881           extra_bits0[ti0]=(ogg_uint16_t)eb;
882           ti0++;
883           eob_run0=0;
884         }
885         /*Stack 1 token is one of: a pure zero run token, a single
886            coefficient token, or a zero run/coefficient combo token.
887           A zero run token is expanded and moved to token stack 0, and the
888            stack 1 entry dropped.
889           A single coefficient value may be transformed into combo token that
890            is moved to stack 0, or if it cannot be combined, it is left alone
891            and a single length-1 zero run is emitted in stack 0.
892           A combo token is extended and moved to stack 0.
893           During AC coding, we restrict the run lengths on combo tokens for
894            stack 1 to guarantee we can extend them.*/
895         switch(token1){
896           case OC_DCT_SHORT_ZRL_TOKEN:{
897             if(eb1<7){
898               dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
899               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
900               ti0++;
901               /*Don't write the AC coefficient back out.*/
902               continue;
903             }
904             /*Fall through.*/
905           }
906           case OC_DCT_ZRL_TOKEN:{
907             dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
908             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
909             ti0++;
910             /*Don't write the AC coefficient back out.*/
911           }continue;
912           case OC_ONE_TOKEN:
913           case OC_MINUS_ONE_TOKEN:{
914             dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
915             extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
916             ti0++;
917             /*Don't write the AC coefficient back out.*/
918           }continue;
919           case OC_TWO_TOKEN:
920           case OC_MINUS_TWO_TOKEN:{
921             dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
922             extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
923             ti0++;
924             /*Don't write the AC coefficient back out.*/
925           }continue;
926           case OC_DCT_VAL_CAT2:{
927             dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
928             extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
929             ti0++;
930             /*Don't write the AC coefficient back out.*/
931           }continue;
932           case OC_DCT_RUN_CAT1A:
933           case OC_DCT_RUN_CAT1A+1:
934           case OC_DCT_RUN_CAT1A+2:
935           case OC_DCT_RUN_CAT1A+3:{
936             dct_tokens0[ti0]=(unsigned char)(token1+1);
937             extra_bits0[ti0]=(ogg_uint16_t)eb1;
938             ti0++;
939             /*Don't write the AC coefficient back out.*/
940           }continue;
941           case OC_DCT_RUN_CAT1A+4:{
942             dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
943             extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
944             ti0++;
945             /*Don't write the AC coefficient back out.*/
946           }continue;
947           case OC_DCT_RUN_CAT1B:{
948             if((eb1&3)<3){
949               dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
950               extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
951               ti0++;
952               /*Don't write the AC coefficient back out.*/
953               continue;
954             }
955             eb1=((eb1&4)<<1)-1;
956             /*Fall through.*/
957           }
958           case OC_DCT_RUN_CAT1C:{
959             dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
960             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
961             ti0++;
962             /*Don't write the AC coefficient back out.*/
963           }continue;
964           case OC_DCT_RUN_CAT2A:{
965             eb1=(eb1<<1)-1;
966             /*Fall through.*/
967           }
968           case OC_DCT_RUN_CAT2B:{
969             dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
970             extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
971             ti0++;
972             /*Don't write the AC coefficient back out.*/
973           }continue;
974         }
975         /*We can't merge tokens, write a short zero run and keep going.*/
976         dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
977         extra_bits0[ti0]=0;
978         ti0++;
979       }
980     }
981     if(!neobs1){
982       /*Flush any (inactive) EOB run.*/
983       if(eob_run1>0){
984         token=oc_make_eob_token_full(eob_run1,&eb);
985         dct_tokens1[ti1w]=(unsigned char)token;
986         extra_bits1[ti1w]=(ogg_uint16_t)eb;
987         ti1w++;
988         eob_run1=0;
989       }
990       /*There's no active EOB run, so log the current token.*/
991       dct_tokens1[ti1w]=(unsigned char)token1;
992       extra_bits1[ti1w]=(ogg_uint16_t)eb1;
993       ti1w++;
994     }
995     else{
996       /*Otherwise consume one EOB from the current run.*/
997       neobs1--;
998       /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
999       if(eob_run1-neobs1>=4095){
1000         token=oc_make_eob_token_full(4095,&eb);
1001         dct_tokens1[ti1w]=(unsigned char)token;
1002         extra_bits1[ti1w]=(ogg_uint16_t)eb;
1003         ti1w++;
1004         eob_run1-=4095;
1005       }
1006     }
1007   }
1008   /*Save the current state.*/
1009   _enc->ndct_tokens[_pli][0]=ti0;
1010   _enc->ndct_tokens[_pli][1]=ti1w;
1011   _enc->eob_run[_pli][0]=eob_run0;
1012   _enc->eob_run[_pli][1]=eob_run1;
1013 }
1014 
1015 /*Final EOB run welding.*/
oc_enc_tokenize_finish(oc_enc_ctx * _enc)1016 void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
1017   int pli;
1018   int zzi;
1019   /*Emit final EOB runs.*/
1020   for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
1021     int eob_run;
1022     eob_run=_enc->eob_run[pli][zzi];
1023     if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
1024   }
1025   /*Merge the final EOB run of one token list with the start of the next, if
1026      possible.*/
1027   for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
1028     int       old_tok1;
1029     int       old_tok2;
1030     int       old_eb1;
1031     int       old_eb2;
1032     int       new_tok;
1033     int       new_eb;
1034     int       zzj;
1035     int       plj;
1036     ptrdiff_t ti=ti;
1037     int       run_count;
1038     /*Make sure this coefficient has tokens at all.*/
1039     if(_enc->ndct_tokens[pli][zzi]<=0)continue;
1040     /*Ensure the first token is an EOB run.*/
1041     old_tok2=_enc->dct_tokens[pli][zzi][0];
1042     if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
1043     /*Search for a previous coefficient that has any tokens at all.*/
1044     old_tok1=OC_NDCT_EOB_TOKEN_MAX;
1045     for(zzj=zzi,plj=pli;zzj>=0;zzj--){
1046       while(plj-->0){
1047         ti=_enc->ndct_tokens[plj][zzj]-1;
1048         if(ti>=_enc->dct_token_offs[plj][zzj]){
1049           old_tok1=_enc->dct_tokens[plj][zzj][ti];
1050           break;
1051         }
1052       }
1053       if(plj>=0)break;
1054       plj=3;
1055     }
1056     /*Ensure its last token was an EOB run.*/
1057     if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
1058     /*Pull off the associated extra bits, if any, and decode the runs.*/
1059     old_eb1=_enc->extra_bits[plj][zzj][ti];
1060     old_eb2=_enc->extra_bits[pli][zzi][0];
1061     run_count=oc_decode_eob_token(old_tok1,old_eb1)
1062      +oc_decode_eob_token(old_tok2,old_eb2);
1063     /*We can't possibly combine these into one run.
1064       It might be possible to split them more optimally, but we'll just leave
1065        them as-is.*/
1066     if(run_count>=4096)continue;
1067     /*We CAN combine them into one run.*/
1068     new_tok=oc_make_eob_token_full(run_count,&new_eb);
1069     _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
1070     _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
1071     _enc->dct_token_offs[pli][zzi]++;
1072   }
1073 }
1074