1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17 #include <stdlib.h>
18 #include <string.h>
19 #include "encint.h"
20
21
22
oc_make_eob_token(int _run_count)23 static int oc_make_eob_token(int _run_count){
24 if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
25 else{
26 int cat;
27 cat=OC_ILOGNZ_32(_run_count)-3;
28 cat=OC_MINI(cat,3);
29 return OC_DCT_REPEAT_RUN0_TOKEN+cat;
30 }
31 }
32
oc_make_eob_token_full(int _run_count,int * _eb)33 static int oc_make_eob_token_full(int _run_count,int *_eb){
34 if(_run_count<4){
35 *_eb=0;
36 return OC_DCT_EOB1_TOKEN+_run_count-1;
37 }
38 else{
39 int cat;
40 cat=OC_ILOGNZ_32(_run_count)-3;
41 cat=OC_MINI(cat,3);
42 *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
43 return OC_DCT_REPEAT_RUN0_TOKEN+cat;
44 }
45 }
46
47 /*Returns the number of blocks ended by an EOB token.*/
oc_decode_eob_token(int _token,int _eb)48 static int oc_decode_eob_token(int _token,int _eb){
49 return (0x20820C41U>>_token*5&0x1F)+_eb;
50 }
51
52 /*TODO: This is now only used during DCT tokenization, and never for runs; it
53 should be simplified.*/
oc_make_dct_token_full(int _zzi,int _zzj,int _val,int * _eb)54 static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
55 int neg;
56 int zero_run;
57 int token;
58 int eb;
59 neg=_val<0;
60 _val=abs(_val);
61 zero_run=_zzj-_zzi;
62 if(zero_run>0){
63 int adj;
64 /*Implement a minor restriction on stack 1 so that we know during DC fixups
65 that extending a dctrun token from stack 1 will never overflow.*/
66 adj=_zzi!=1;
67 if(_val<2&&zero_run<17+adj){
68 if(zero_run<6){
69 token=OC_DCT_RUN_CAT1A+zero_run-1;
70 eb=neg;
71 }
72 else if(zero_run<10){
73 token=OC_DCT_RUN_CAT1B;
74 eb=zero_run-6+(neg<<2);
75 }
76 else{
77 token=OC_DCT_RUN_CAT1C;
78 eb=zero_run-10+(neg<<3);
79 }
80 }
81 else if(_val<4&&zero_run<3+adj){
82 if(zero_run<2){
83 token=OC_DCT_RUN_CAT2A;
84 eb=_val-2+(neg<<1);
85 }
86 else{
87 token=OC_DCT_RUN_CAT2B;
88 eb=zero_run-2+(_val-2<<1)+(neg<<2);
89 }
90 }
91 else{
92 if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN;
93 else token=OC_DCT_ZRL_TOKEN;
94 eb=zero_run-1;
95 }
96 }
97 else if(_val<3){
98 token=OC_ONE_TOKEN+(_val-1<<1)+neg;
99 eb=0;
100 }
101 else if(_val<7){
102 token=OC_DCT_VAL_CAT2+_val-3;
103 eb=neg;
104 }
105 else if(_val<9){
106 token=OC_DCT_VAL_CAT3;
107 eb=_val-7+(neg<<1);
108 }
109 else if(_val<13){
110 token=OC_DCT_VAL_CAT4;
111 eb=_val-9+(neg<<2);
112 }
113 else if(_val<21){
114 token=OC_DCT_VAL_CAT5;
115 eb=_val-13+(neg<<3);
116 }
117 else if(_val<37){
118 token=OC_DCT_VAL_CAT6;
119 eb=_val-21+(neg<<4);
120 }
121 else if(_val<69){
122 token=OC_DCT_VAL_CAT7;
123 eb=_val-37+(neg<<5);
124 }
125 else{
126 token=OC_DCT_VAL_CAT8;
127 eb=_val-69+(neg<<9);
128 }
129 *_eb=eb;
130 return token;
131 }
132
133 /*Token logging to allow a few fragments of efficient rollback.
134 Late SKIP analysis is tied up in the tokenization process, so we need to be
135 able to undo a fragment's tokens on a whim.*/
136
137 static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
138 0,16,16,16,16,16,32,32,
139 32,32,32,32,32,32,32,48,
140 48,48,48,48,48,48,48,48,
141 48,48,48,48,64,64,64,64,
142 64,64,64,64,64,64,64,64,
143 64,64,64,64,64,64,64,64,
144 64,64,64,64,64,64,64,64
145 };
146
oc_token_bits(oc_enc_ctx * _enc,int _huffi,int _zzi,int _token)147 static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
148 return _enc->huff_codes[_huffi+OC_ZZI_HUFF_OFFSET[_zzi]][_token].nbits
149 +OC_DCT_TOKEN_EXTRA_BITS[_token];
150 }
151
oc_enc_tokenlog_checkpoint(oc_enc_ctx * _enc,oc_token_checkpoint * _cp,int _pli,int _zzi)152 static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
153 oc_token_checkpoint *_cp,int _pli,int _zzi){
154 _cp->pli=_pli;
155 _cp->zzi=_zzi;
156 _cp->eob_run=_enc->eob_run[_pli][_zzi];
157 _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi];
158 }
159
oc_enc_tokenlog_rollback(oc_enc_ctx * _enc,const oc_token_checkpoint * _stack,int _n)160 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
161 const oc_token_checkpoint *_stack,int _n){
162 int i;
163 for(i=_n;i-->0;){
164 int pli;
165 int zzi;
166 pli=_stack[i].pli;
167 zzi=_stack[i].zzi;
168 _enc->eob_run[pli][zzi]=_stack[i].eob_run;
169 _enc->ndct_tokens[pli][zzi]=_stack[i].ndct_tokens;
170 }
171 }
172
oc_enc_token_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _token,int _eb)173 static void oc_enc_token_log(oc_enc_ctx *_enc,
174 int _pli,int _zzi,int _token,int _eb){
175 ptrdiff_t ti;
176 ti=_enc->ndct_tokens[_pli][_zzi]++;
177 _enc->dct_tokens[_pli][_zzi][ti]=(unsigned char)_token;
178 _enc->extra_bits[_pli][_zzi][ti]=(ogg_uint16_t)_eb;
179 }
180
oc_enc_eob_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _run_count)181 static void oc_enc_eob_log(oc_enc_ctx *_enc,
182 int _pli,int _zzi,int _run_count){
183 int token;
184 int eb;
185 token=oc_make_eob_token_full(_run_count,&eb);
186 oc_enc_token_log(_enc,_pli,_zzi,token,eb);
187 }
188
189
oc_enc_tokenize_start(oc_enc_ctx * _enc)190 void oc_enc_tokenize_start(oc_enc_ctx *_enc){
191 memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens));
192 memset(_enc->eob_run,0,sizeof(_enc->eob_run));
193 memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
194 memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
195 }
196
197 typedef struct oc_quant_token oc_quant_token;
198
199 /*A single node in the Viterbi trellis.
200 We maintain up to 2 of these per coefficient:
201 - A token to code if the value is zero (EOB, zero run, or combo token).
202 - A token to code if the value is not zero (DCT value token).*/
203 struct oc_quant_token{
204 unsigned char next;
205 signed char token;
206 ogg_int16_t eb;
207 ogg_uint32_t cost;
208 int bits;
209 int qc;
210 };
211
212 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
213 dequantizes and de-zig-zags the result.
214 The DC coefficient is not preserved; it should be restored by the caller.*/
oc_enc_tokenize_ac(oc_enc_ctx * _enc,int _pli,ptrdiff_t _fragi,ogg_int16_t * _qdct,const ogg_uint16_t * _dequant,const ogg_int16_t * _dct,int _zzi,oc_token_checkpoint ** _stack,int _acmin)215 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
216 ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
217 int _zzi,oc_token_checkpoint **_stack,int _acmin){
218 oc_token_checkpoint *stack;
219 ogg_int64_t zflags;
220 ogg_int64_t nzflags;
221 ogg_int64_t best_flags;
222 ogg_uint32_t d2_accum[64];
223 oc_quant_token tokens[64][2];
224 ogg_uint16_t *eob_run;
225 const unsigned char *dct_fzig_zag;
226 ogg_uint32_t cost;
227 int bits;
228 int eob;
229 int token;
230 int eb;
231 int next;
232 int huffi;
233 int zzi;
234 int ti;
235 int zzj;
236 int qc;
237 (void)_fragi;
238 huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
239 eob_run=_enc->eob_run[_pli];
240 memset(tokens[0],0,sizeof(tokens[0]));
241 best_flags=nzflags=0;
242 zflags=1;
243 d2_accum[0]=0;
244 zzj=64;
245 for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){
246 ogg_int32_t lambda;
247 ogg_uint32_t best_cost;
248 int best_bits=best_bits;
249 int best_next=best_next;
250 int best_token=best_token;
251 int best_eb=best_eb;
252 int best_qc=best_qc;
253 int flush_bits;
254 ogg_uint32_t d2;
255 int dq;
256 int e;
257 int c;
258 int s;
259 int tj;
260 lambda=_enc->lambda;
261 qc=_qdct[zzi];
262 s=-(qc<0);
263 qc=qc+s^s;
264 c=_dct[OC_FZIG_ZAG[zzi]];
265 if(qc<=1){
266 ogg_uint32_t sum_d2;
267 int nzeros;
268 int dc_reserve;
269 /*The hard case: try a zero run.*/
270 if(!qc){
271 /*Skip runs that are already quantized to zeros.
272 If we considered each zero coefficient in turn, we might
273 theoretically find a better way to partition long zero runs (e.g.,
274 a run of > 17 zeros followed by a 1 might be better coded as a short
275 zero run followed by a combo token, rather than the longer zero
276 token followed by a 1 value token), but zeros are so common that
277 this becomes very computationally expensive (quadratic instead of
278 linear in the number of coefficients), for a marginal gain.*/
279 while(zzi>1&&!_qdct[zzi-1])zzi--;
280 /*The distortion of coefficients originally quantized to zero is
281 treated as zero (since we'll never quantize them to anything else).*/
282 d2=0;
283 }
284 else{
285 c=c+s^s;
286 d2=c*(ogg_int32_t)c;
287 }
288 eob=eob_run[zzi];
289 nzeros=zzj-zzi;
290 zzj&=63;
291 sum_d2=d2+d2_accum[zzj];
292 d2_accum[zzi]=sum_d2;
293 flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
294 /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
295 to ensure they can be extended to include the DC coefficient if
296 necessary; this greatly simplifies stack-rewriting later on.*/
297 dc_reserve=zzi+62>>6;
298 best_cost=0xFFFFFFFF;
299 for(;;){
300 if(nzflags>>zzj&1){
301 int cat;
302 int val;
303 int val_s;
304 int zzk;
305 int tk;
306 next=tokens[zzj][1].next;
307 tk=next&1;
308 zzk=next>>1;
309 /*Try a pure zero run to this point.*/
310 cat=nzeros+55>>6;
311 token=OC_DCT_SHORT_ZRL_TOKEN+cat;
312 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
313 d2=sum_d2-d2_accum[zzj];
314 cost=d2+lambda*bits+tokens[zzj][1].cost;
315 if(cost<=best_cost){
316 best_next=(zzj<<1)+1;
317 best_token=token;
318 best_eb=nzeros-1;
319 best_cost=cost;
320 best_bits=bits+tokens[zzj][1].bits;
321 best_qc=0;
322 }
323 if(nzeros<16+dc_reserve){
324 val=_qdct[zzj];
325 val_s=-(val<0);
326 val=val+val_s^val_s;
327 if(val<=2){
328 /*Try a +/- 1 combo token.*/
329 if(nzeros<6){
330 token=OC_DCT_RUN_CAT1A+nzeros-1;
331 eb=-val_s;
332 }
333 else{
334 cat=nzeros+54>>6;
335 token=OC_DCT_RUN_CAT1B+cat;
336 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
337 }
338 e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
339 d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
340 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
341 cost=d2+lambda*bits+tokens[zzk][tk].cost;
342 if(cost<=best_cost){
343 best_next=next;
344 best_token=token;
345 best_eb=eb;
346 best_cost=cost;
347 best_bits=bits+tokens[zzk][tk].bits;
348 best_qc=1+val_s^val_s;
349 }
350 }
351 if(nzeros<2+dc_reserve&&2<=val&&val<=4){
352 /*Try a +/- 2/3 combo token.*/
353 cat=nzeros>>1;
354 token=OC_DCT_RUN_CAT2A+cat;
355 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
356 val=2+((val+val_s^val_s)>2);
357 e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
358 d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
359 cost=d2+lambda*bits+tokens[zzk][tk].cost;
360 if(cost<=best_cost){
361 best_cost=cost;
362 best_bits=bits+tokens[zzk][tk].bits;
363 best_next=next;
364 best_token=token;
365 best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
366 best_qc=val+val_s^val_s;
367 }
368 }
369 }
370 /*zzj can't be coded as a zero, so stop trying to extend the run.*/
371 if(!(zflags>>zzj&1))break;
372 }
373 /*We could try to consider _all_ potentially non-zero coefficients, but
374 if we already found a bunch of them not worth coding, it's fairly
375 unlikely they would now be worth coding from this position; skipping
376 them saves a lot of work.*/
377 zzj=(tokens[zzj][0].next>>1)-(tokens[zzj][0].qc!=0)&63;
378 if(zzj==0){
379 /*We made it all the way to the end of the block; try an EOB token.*/
380 if(eob<4095){
381 bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
382 -flush_bits;
383 }
384 else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
385 cost=sum_d2+bits*lambda;
386 /*If the best route so far is still a pure zero run to the end of the
387 block, force coding it as an EOB.
388 Even if it's not optimal for this block, it has a good chance of
389 getting combined with an EOB token from subsequent blocks, saving
390 bits overall.*/
391 if(cost<=best_cost||best_token<=OC_DCT_ZRL_TOKEN&&zzi+best_eb==63){
392 best_next=0;
393 /*This token is just a marker; in reality we may not emit any
394 tokens, but update eob_run[] instead.*/
395 best_token=OC_DCT_EOB1_TOKEN;
396 best_eb=0;
397 best_cost=cost;
398 best_bits=bits;
399 best_qc=0;
400 }
401 break;
402 }
403 nzeros=zzj-zzi;
404 }
405 tokens[zzi][0].next=(unsigned char)best_next;
406 tokens[zzi][0].token=(signed char)best_token;
407 tokens[zzi][0].eb=(ogg_int16_t)best_eb;
408 tokens[zzi][0].cost=best_cost;
409 tokens[zzi][0].bits=best_bits;
410 tokens[zzi][0].qc=best_qc;
411 zflags|=(ogg_int64_t)1<<zzi;
412 if(qc){
413 dq=_dequant[zzi];
414 if(zzi<_acmin)lambda=0;
415 e=dq-c;
416 d2=e*(ogg_int32_t)e;
417 token=OC_ONE_TOKEN-s;
418 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
419 zzj=zzi+1&63;
420 tj=best_flags>>zzj&1;
421 next=(zzj<<1)+tj;
422 tokens[zzi][1].next=(unsigned char)next;
423 tokens[zzi][1].token=(signed char)token;
424 tokens[zzi][1].eb=0;
425 tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
426 tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
427 tokens[zzi][1].qc=1+s^s;
428 nzflags|=(ogg_int64_t)1<<zzi;
429 best_flags|=
430 (ogg_int64_t)(tokens[zzi][1].cost<tokens[zzi][0].cost)<<zzi;
431 }
432 }
433 else{
434 eob=eob_run[zzi];
435 if(zzi<_acmin)lambda=0;
436 c=c+s^s;
437 dq=_dequant[zzi];
438 /*No zero run can extend past this point.*/
439 d2_accum[zzi]=0;
440 flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
441 if(qc<=2){
442 e=2*dq-c;
443 d2=e*(ogg_int32_t)e;
444 best_token=OC_TWO_TOKEN-s;
445 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
446 best_cost=d2+lambda*best_bits;
447 e-=dq;
448 d2=e*(ogg_int32_t)e;
449 token=OC_ONE_TOKEN-s;
450 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
451 cost=d2+lambda*bits;
452 if(cost<=best_cost){
453 best_token=token;
454 best_bits=bits;
455 best_cost=cost;
456 qc--;
457 }
458 best_eb=0;
459 }
460 else if(qc<=3){
461 e=3*dq-c;
462 d2=e*(ogg_int32_t)e;
463 best_token=OC_DCT_VAL_CAT2;
464 best_eb=-s;
465 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
466 best_cost=d2+lambda*best_bits;
467 e-=dq;
468 d2=e*(ogg_int32_t)e;
469 token=OC_TWO_TOKEN-s;
470 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
471 cost=d2+lambda*bits;
472 if(cost<=best_cost){
473 best_token=token;
474 best_eb=0;
475 best_bits=bits;
476 best_cost=cost;
477 qc--;
478 }
479 }
480 else if(qc<=6){
481 e=qc*dq-c;
482 d2=e*(ogg_int32_t)e;
483 best_token=OC_DCT_VAL_CAT2+qc-3;
484 best_eb=-s;
485 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
486 best_cost=d2+lambda*best_bits;
487 e-=dq;
488 d2=e*(ogg_int32_t)e;
489 token=best_token-1;
490 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
491 cost=d2+lambda*bits;
492 if(cost<=best_cost){
493 best_token=token;
494 best_bits=bits;
495 best_cost=cost;
496 qc--;
497 }
498 }
499 else if(qc<=8){
500 e=qc*dq-c;
501 d2=e*(ogg_int32_t)e;
502 best_token=OC_DCT_VAL_CAT3;
503 best_eb=(-s<<1)+qc-7;
504 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
505 best_cost=d2+lambda*best_bits;
506 e=6*dq-c;
507 d2=e*(ogg_int32_t)e;
508 token=OC_DCT_VAL_CAT2+3;
509 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
510 cost=d2+lambda*bits;
511 if(cost<=best_cost){
512 best_token=token;
513 best_eb=-s;
514 best_bits=bits;
515 best_cost=cost;
516 qc=6;
517 }
518 }
519 else if(qc<=12){
520 e=qc*dq-c;
521 d2=e*(ogg_int32_t)e;
522 best_token=OC_DCT_VAL_CAT4;
523 best_eb=(-s<<2)+qc-9;
524 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
525 best_cost=d2+lambda*best_bits;
526 e=8*dq-c;
527 d2=e*(ogg_int32_t)e;
528 token=best_token-1;
529 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
530 cost=d2+lambda*bits;
531 if(cost<=best_cost){
532 best_token=token;
533 best_eb=(-s<<1)+1;
534 best_bits=bits;
535 best_cost=cost;
536 qc=8;
537 }
538 }
539 else if(qc<=20){
540 e=qc*dq-c;
541 d2=e*(ogg_int32_t)e;
542 best_token=OC_DCT_VAL_CAT5;
543 best_eb=(-s<<3)+qc-13;
544 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
545 best_cost=d2+lambda*best_bits;
546 e=12*dq-c;
547 d2=e*(ogg_int32_t)e;
548 token=best_token-1;
549 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
550 cost=d2+lambda*bits;
551 if(cost<=best_cost){
552 best_token=token;
553 best_eb=(-s<<2)+3;
554 best_bits=bits;
555 best_cost=cost;
556 qc=12;
557 }
558 }
559 else if(qc<=36){
560 e=qc*dq-c;
561 d2=e*(ogg_int32_t)e;
562 best_token=OC_DCT_VAL_CAT6;
563 best_eb=(-s<<4)+qc-21;
564 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
565 best_cost=d2+lambda*best_bits;
566 e=20*dq-c;
567 d2=e*(ogg_int32_t)e;
568 token=best_token-1;
569 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
570 cost=d2+lambda*bits;
571 if(cost<=best_cost){
572 best_token=token;
573 best_eb=(-s<<3)+7;
574 best_bits=bits;
575 best_cost=cost;
576 qc=20;
577 }
578 }
579 else if(qc<=68){
580 e=qc*dq-c;
581 d2=e*(ogg_int32_t)e;
582 best_token=OC_DCT_VAL_CAT7;
583 best_eb=(-s<<5)+qc-37;
584 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
585 best_cost=d2+lambda*best_bits;
586 e=36*dq-c;
587 d2=e*(ogg_int32_t)e;
588 token=best_token-1;
589 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
590 cost=d2+lambda*bits;
591 if(cost<best_cost){
592 best_token=token;
593 best_eb=(-s<<4)+15;
594 best_bits=bits;
595 best_cost=cost;
596 qc=36;
597 }
598 }
599 else{
600 e=qc*dq-c;
601 d2=e*(ogg_int32_t)e;
602 best_token=OC_DCT_VAL_CAT8;
603 best_eb=(-s<<9)+qc-69;
604 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
605 best_cost=d2+lambda*best_bits;
606 e=68*dq-c;
607 d2=e*(ogg_int32_t)e;
608 token=best_token-1;
609 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
610 cost=d2+lambda*bits;
611 if(cost<best_cost){
612 best_token=token;
613 best_eb=(-s<<5)+31;
614 best_bits=bits;
615 best_cost=cost;
616 qc=68;
617 }
618 }
619 zzj=zzi+1&63;
620 tj=best_flags>>zzj&1;
621 next=(zzj<<1)+tj;
622 tokens[zzi][1].next=(unsigned char)next;
623 tokens[zzi][1].token=(signed char)best_token;
624 tokens[zzi][1].eb=best_eb;
625 tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
626 tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
627 tokens[zzi][1].qc=qc+s^s;
628 nzflags|=(ogg_int64_t)1<<zzi;
629 best_flags|=(ogg_int64_t)1<<zzi;
630 }
631 zzj=zzi;
632 }
633 /*Emit the tokens from the best path through the trellis.*/
634 stack=*_stack;
635 /*We blow away the first entry here so that things vectorize better.
636 The DC coefficient is not actually stored in the array yet.*/
637 for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
638 dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
639 zzi=1;
640 ti=best_flags>>1&1;
641 bits=tokens[zzi][ti].bits;
642 do{
643 oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
644 eob=eob_run[zzi];
645 if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
646 if(++eob>=4095){
647 oc_enc_eob_log(_enc,_pli,zzi,eob);
648 eob=0;
649 }
650 eob_run[zzi]=eob;
651 /*We don't include the actual EOB cost for this block in the return value.
652 It will be paid for by the fragment that terminates the EOB run.*/
653 bits-=tokens[zzi][ti].bits;
654 zzi=_zzi;
655 break;
656 }
657 /*Emit pending EOB run if any.*/
658 if(eob>0){
659 oc_enc_eob_log(_enc,_pli,zzi,eob);
660 eob_run[zzi]=0;
661 }
662 oc_enc_token_log(_enc,_pli,zzi,tokens[zzi][ti].token,tokens[zzi][ti].eb);
663 next=tokens[zzi][ti].next;
664 qc=tokens[zzi][ti].qc;
665 zzj=(next>>1)-1&63;
666 /*TODO: It may be worth saving the dequantized coefficient in the trellis
667 above; we had to compute it to measure the error anyway.*/
668 _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
669 zzi=next>>1;
670 ti=next&1;
671 }
672 while(zzi);
673 *_stack=stack;
674 return bits;
675 }
676
oc_enc_pred_dc_frag_rows(oc_enc_ctx * _enc,int _pli,int _fragy0,int _frag_yend)677 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
678 int _pli,int _fragy0,int _frag_yend){
679 const oc_fragment_plane *fplane;
680 const oc_fragment *frags;
681 ogg_int16_t *frag_dc;
682 ptrdiff_t fragi;
683 int *pred_last;
684 int nhfrags;
685 int fragx;
686 int fragy;
687 fplane=_enc->state.fplanes+_pli;
688 frags=_enc->state.frags;
689 frag_dc=_enc->frag_dc;
690 pred_last=_enc->dc_pred_last[_pli];
691 nhfrags=fplane->nhfrags;
692 fragi=fplane->froffset+_fragy0*nhfrags;
693 for(fragy=_fragy0;fragy<_frag_yend;fragy++){
694 if(fragy==0){
695 /*For the first row, all of the cases reduce to just using the previous
696 predictor for the same reference frame.*/
697 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
698 if(frags[fragi].coded){
699 int ref;
700 ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
701 frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
702 pred_last[ref]=frags[fragi].dc;
703 }
704 }
705 }
706 else{
707 const oc_fragment *u_frags;
708 int l_ref;
709 int ul_ref;
710 int u_ref;
711 u_frags=frags-nhfrags;
712 l_ref=-1;
713 ul_ref=-1;
714 u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
715 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
716 int ur_ref;
717 if(fragx+1>=nhfrags)ur_ref=-1;
718 else{
719 ur_ref=u_frags[fragi+1].coded?
720 OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
721 }
722 if(frags[fragi].coded){
723 int pred;
724 int ref;
725 ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
726 /*We break out a separate case based on which of our neighbors use
727 the same reference frames.
728 This is somewhat faster than trying to make a generic case which
729 handles all of them, since it reduces lots of poorly predicted
730 jumps to one switch statement, and also lets a number of the
731 multiplications be optimized out by strength reduction.*/
732 switch((l_ref==ref)|(ul_ref==ref)<<1|
733 (u_ref==ref)<<2|(ur_ref==ref)<<3){
734 default:pred=pred_last[ref];break;
735 case 1:
736 case 3:pred=frags[fragi-1].dc;break;
737 case 2:pred=u_frags[fragi-1].dc;break;
738 case 4:
739 case 6:
740 case 12:pred=u_frags[fragi].dc;break;
741 case 5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
742 case 8:pred=u_frags[fragi+1].dc;break;
743 case 9:
744 case 11:
745 case 13:{
746 pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
747 }break;
748 case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
749 case 14:{
750 pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
751 +10*u_frags[fragi].dc)/16;
752 }break;
753 case 7:
754 case 15:{
755 int p0;
756 int p1;
757 int p2;
758 p0=frags[fragi-1].dc;
759 p1=u_frags[fragi-1].dc;
760 p2=u_frags[fragi].dc;
761 pred=(29*(p0+p2)-26*p1)/32;
762 if(abs(pred-p2)>128)pred=p2;
763 else if(abs(pred-p0)>128)pred=p0;
764 else if(abs(pred-p1)>128)pred=p1;
765 }break;
766 }
767 frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
768 pred_last[ref]=frags[fragi].dc;
769 l_ref=ref;
770 }
771 else l_ref=-1;
772 ul_ref=u_ref;
773 u_ref=ur_ref;
774 }
775 }
776 }
777 }
778
oc_enc_tokenize_dc_frag_list(oc_enc_ctx * _enc,int _pli,const ptrdiff_t * _coded_fragis,ptrdiff_t _ncoded_fragis,int _prev_ndct_tokens1,int _prev_eob_run1)779 void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
780 const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
781 int _prev_ndct_tokens1,int _prev_eob_run1){
782 const ogg_int16_t *frag_dc;
783 ptrdiff_t fragii;
784 unsigned char *dct_tokens0;
785 unsigned char *dct_tokens1;
786 ogg_uint16_t *extra_bits0;
787 ogg_uint16_t *extra_bits1;
788 ptrdiff_t ti0;
789 ptrdiff_t ti1r;
790 ptrdiff_t ti1w;
791 int eob_run0;
792 int eob_run1;
793 int neobs1;
794 int token;
795 int eb;
796 int token1=token1;
797 int eb1=eb1;
798 /*Return immediately if there are no coded fragments; otherwise we'd flush
799 any trailing EOB run into the AC 1 list and never read it back out.*/
800 if(_ncoded_fragis<=0)return;
801 frag_dc=_enc->frag_dc;
802 dct_tokens0=_enc->dct_tokens[_pli][0];
803 dct_tokens1=_enc->dct_tokens[_pli][1];
804 extra_bits0=_enc->extra_bits[_pli][0];
805 extra_bits1=_enc->extra_bits[_pli][1];
806 ti0=_enc->ndct_tokens[_pli][0];
807 ti1w=ti1r=_prev_ndct_tokens1;
808 eob_run0=_enc->eob_run[_pli][0];
809 /*Flush any trailing EOB run for the 1st AC coefficient.
810 This is needed to allow us to track tokens to the end of the list.*/
811 eob_run1=_enc->eob_run[_pli][1];
812 if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
813 /*If there was an active EOB run at the start of the 1st AC stack, read it
814 in and decode it.*/
815 if(_prev_eob_run1>0){
816 token1=dct_tokens1[ti1r];
817 eb1=extra_bits1[ti1r];
818 ti1r++;
819 eob_run1=oc_decode_eob_token(token1,eb1);
820 /*Consume the portion of the run that came before these fragments.*/
821 neobs1=eob_run1-_prev_eob_run1;
822 }
823 else eob_run1=neobs1=0;
824 for(fragii=0;fragii<_ncoded_fragis;fragii++){
825 int val;
826 /*All tokens in the 1st AC coefficient stack are regenerated as the DC
827 coefficients are produced.
828 This can be done in-place; stack 1 cannot get larger.*/
829 if(!neobs1){
830 /*There's no active EOB run in stack 1; read the next token.*/
831 token1=dct_tokens1[ti1r];
832 eb1=extra_bits1[ti1r];
833 ti1r++;
834 if(token1<OC_NDCT_EOB_TOKEN_MAX){
835 neobs1=oc_decode_eob_token(token1,eb1);
836 /*It's an EOB run; add it to the current (inactive) one.
837 Because we may have moved entries to stack 0, we may have an
838 opportunity to merge two EOB runs in stack 1.*/
839 eob_run1+=neobs1;
840 }
841 }
842 val=frag_dc[_coded_fragis[fragii]];
843 if(val){
844 /*There was a non-zero DC value, so there's no alteration to stack 1
845 for this fragment; just code the stack 0 token.*/
846 /*Flush any pending EOB run.*/
847 if(eob_run0>0){
848 token=oc_make_eob_token_full(eob_run0,&eb);
849 dct_tokens0[ti0]=(unsigned char)token;
850 extra_bits0[ti0]=(ogg_uint16_t)eb;
851 ti0++;
852 eob_run0=0;
853 }
854 token=oc_make_dct_token_full(0,0,val,&eb);
855 dct_tokens0[ti0]=(unsigned char)token;
856 extra_bits0[ti0]=(ogg_uint16_t)eb;
857 ti0++;
858 }
859 else{
860 /*Zero DC value; that means the entry in stack 1 might need to be coded
861 from stack 0.
862 This requires a stack 1 fixup.*/
863 if(neobs1>0){
864 /*We're in the middle of an active EOB run in stack 1.
865 Move it to stack 0.*/
866 if(++eob_run0>=4095){
867 token=oc_make_eob_token_full(eob_run0,&eb);
868 dct_tokens0[ti0]=(unsigned char)token;
869 extra_bits0[ti0]=(ogg_uint16_t)eb;
870 ti0++;
871 eob_run0=0;
872 }
873 eob_run1--;
874 }
875 else{
876 /*No active EOB run in stack 1, so we can't extend one in stack 0.
877 Flush it if we've got it.*/
878 if(eob_run0>0){
879 token=oc_make_eob_token_full(eob_run0,&eb);
880 dct_tokens0[ti0]=(unsigned char)token;
881 extra_bits0[ti0]=(ogg_uint16_t)eb;
882 ti0++;
883 eob_run0=0;
884 }
885 /*Stack 1 token is one of: a pure zero run token, a single
886 coefficient token, or a zero run/coefficient combo token.
887 A zero run token is expanded and moved to token stack 0, and the
888 stack 1 entry dropped.
889 A single coefficient value may be transformed into combo token that
890 is moved to stack 0, or if it cannot be combined, it is left alone
891 and a single length-1 zero run is emitted in stack 0.
892 A combo token is extended and moved to stack 0.
893 During AC coding, we restrict the run lengths on combo tokens for
894 stack 1 to guarantee we can extend them.*/
895 switch(token1){
896 case OC_DCT_SHORT_ZRL_TOKEN:{
897 if(eb1<7){
898 dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
899 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
900 ti0++;
901 /*Don't write the AC coefficient back out.*/
902 continue;
903 }
904 /*Fall through.*/
905 }
906 case OC_DCT_ZRL_TOKEN:{
907 dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
908 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
909 ti0++;
910 /*Don't write the AC coefficient back out.*/
911 }continue;
912 case OC_ONE_TOKEN:
913 case OC_MINUS_ONE_TOKEN:{
914 dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
915 extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
916 ti0++;
917 /*Don't write the AC coefficient back out.*/
918 }continue;
919 case OC_TWO_TOKEN:
920 case OC_MINUS_TWO_TOKEN:{
921 dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
922 extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
923 ti0++;
924 /*Don't write the AC coefficient back out.*/
925 }continue;
926 case OC_DCT_VAL_CAT2:{
927 dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
928 extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
929 ti0++;
930 /*Don't write the AC coefficient back out.*/
931 }continue;
932 case OC_DCT_RUN_CAT1A:
933 case OC_DCT_RUN_CAT1A+1:
934 case OC_DCT_RUN_CAT1A+2:
935 case OC_DCT_RUN_CAT1A+3:{
936 dct_tokens0[ti0]=(unsigned char)(token1+1);
937 extra_bits0[ti0]=(ogg_uint16_t)eb1;
938 ti0++;
939 /*Don't write the AC coefficient back out.*/
940 }continue;
941 case OC_DCT_RUN_CAT1A+4:{
942 dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
943 extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
944 ti0++;
945 /*Don't write the AC coefficient back out.*/
946 }continue;
947 case OC_DCT_RUN_CAT1B:{
948 if((eb1&3)<3){
949 dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
950 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
951 ti0++;
952 /*Don't write the AC coefficient back out.*/
953 continue;
954 }
955 eb1=((eb1&4)<<1)-1;
956 /*Fall through.*/
957 }
958 case OC_DCT_RUN_CAT1C:{
959 dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
960 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
961 ti0++;
962 /*Don't write the AC coefficient back out.*/
963 }continue;
964 case OC_DCT_RUN_CAT2A:{
965 eb1=(eb1<<1)-1;
966 /*Fall through.*/
967 }
968 case OC_DCT_RUN_CAT2B:{
969 dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
970 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
971 ti0++;
972 /*Don't write the AC coefficient back out.*/
973 }continue;
974 }
975 /*We can't merge tokens, write a short zero run and keep going.*/
976 dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
977 extra_bits0[ti0]=0;
978 ti0++;
979 }
980 }
981 if(!neobs1){
982 /*Flush any (inactive) EOB run.*/
983 if(eob_run1>0){
984 token=oc_make_eob_token_full(eob_run1,&eb);
985 dct_tokens1[ti1w]=(unsigned char)token;
986 extra_bits1[ti1w]=(ogg_uint16_t)eb;
987 ti1w++;
988 eob_run1=0;
989 }
990 /*There's no active EOB run, so log the current token.*/
991 dct_tokens1[ti1w]=(unsigned char)token1;
992 extra_bits1[ti1w]=(ogg_uint16_t)eb1;
993 ti1w++;
994 }
995 else{
996 /*Otherwise consume one EOB from the current run.*/
997 neobs1--;
998 /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
999 if(eob_run1-neobs1>=4095){
1000 token=oc_make_eob_token_full(4095,&eb);
1001 dct_tokens1[ti1w]=(unsigned char)token;
1002 extra_bits1[ti1w]=(ogg_uint16_t)eb;
1003 ti1w++;
1004 eob_run1-=4095;
1005 }
1006 }
1007 }
1008 /*Save the current state.*/
1009 _enc->ndct_tokens[_pli][0]=ti0;
1010 _enc->ndct_tokens[_pli][1]=ti1w;
1011 _enc->eob_run[_pli][0]=eob_run0;
1012 _enc->eob_run[_pli][1]=eob_run1;
1013 }
1014
1015 /*Final EOB run welding.*/
oc_enc_tokenize_finish(oc_enc_ctx * _enc)1016 void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
1017 int pli;
1018 int zzi;
1019 /*Emit final EOB runs.*/
1020 for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
1021 int eob_run;
1022 eob_run=_enc->eob_run[pli][zzi];
1023 if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
1024 }
1025 /*Merge the final EOB run of one token list with the start of the next, if
1026 possible.*/
1027 for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
1028 int old_tok1;
1029 int old_tok2;
1030 int old_eb1;
1031 int old_eb2;
1032 int new_tok;
1033 int new_eb;
1034 int zzj;
1035 int plj;
1036 ptrdiff_t ti=ti;
1037 int run_count;
1038 /*Make sure this coefficient has tokens at all.*/
1039 if(_enc->ndct_tokens[pli][zzi]<=0)continue;
1040 /*Ensure the first token is an EOB run.*/
1041 old_tok2=_enc->dct_tokens[pli][zzi][0];
1042 if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
1043 /*Search for a previous coefficient that has any tokens at all.*/
1044 old_tok1=OC_NDCT_EOB_TOKEN_MAX;
1045 for(zzj=zzi,plj=pli;zzj>=0;zzj--){
1046 while(plj-->0){
1047 ti=_enc->ndct_tokens[plj][zzj]-1;
1048 if(ti>=_enc->dct_token_offs[plj][zzj]){
1049 old_tok1=_enc->dct_tokens[plj][zzj][ti];
1050 break;
1051 }
1052 }
1053 if(plj>=0)break;
1054 plj=3;
1055 }
1056 /*Ensure its last token was an EOB run.*/
1057 if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
1058 /*Pull off the associated extra bits, if any, and decode the runs.*/
1059 old_eb1=_enc->extra_bits[plj][zzj][ti];
1060 old_eb2=_enc->extra_bits[pli][zzi][0];
1061 run_count=oc_decode_eob_token(old_tok1,old_eb1)
1062 +oc_decode_eob_token(old_tok2,old_eb2);
1063 /*We can't possibly combine these into one run.
1064 It might be possible to split them more optimally, but we'll just leave
1065 them as-is.*/
1066 if(run_count>=4096)continue;
1067 /*We CAN combine them into one run.*/
1068 new_tok=oc_make_eob_token_full(run_count,&new_eb);
1069 _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
1070 _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
1071 _enc->dct_token_offs[pli][zzi]++;
1072 }
1073 }
1074