1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function:
14 last mod: $Id$
15
16 ********************************************************************/
17 #include <stdlib.h>
18 #include <string.h>
19 #include "encint.h"
20
21
22
oc_make_eob_token(int _run_count)23 static int oc_make_eob_token(int _run_count){
24 if(_run_count<4)return OC_DCT_EOB1_TOKEN+_run_count-1;
25 else{
26 int cat;
27 cat=OC_ILOGNZ_32(_run_count)-3;
28 cat=OC_MINI(cat,3);
29 return OC_DCT_REPEAT_RUN0_TOKEN+cat;
30 }
31 }
32
oc_make_eob_token_full(int _run_count,int * _eb)33 static int oc_make_eob_token_full(int _run_count,int *_eb){
34 if(_run_count<4){
35 *_eb=0;
36 return OC_DCT_EOB1_TOKEN+_run_count-1;
37 }
38 else{
39 int cat;
40 cat=OC_ILOGNZ_32(_run_count)-3;
41 cat=OC_MINI(cat,3);
42 *_eb=_run_count-OC_BYTE_TABLE32(4,8,16,0,cat);
43 return OC_DCT_REPEAT_RUN0_TOKEN+cat;
44 }
45 }
46
47 /*Returns the number of blocks ended by an EOB token.*/
oc_decode_eob_token(int _token,int _eb)48 static int oc_decode_eob_token(int _token,int _eb){
49 return (0x20820C41U>>_token*5&0x1F)+_eb;
50 }
51
52 /*TODO: This is now only used during DCT tokenization, and never for runs; it
53 should be simplified.*/
oc_make_dct_token_full(int _zzi,int _zzj,int _val,int * _eb)54 static int oc_make_dct_token_full(int _zzi,int _zzj,int _val,int *_eb){
55 int neg;
56 int zero_run;
57 int token;
58 int eb;
59 neg=_val<0;
60 _val=abs(_val);
61 zero_run=_zzj-_zzi;
62 if(zero_run>0){
63 int adj;
64 /*Implement a minor restriction on stack 1 so that we know during DC fixups
65 that extending a dctrun token from stack 1 will never overflow.*/
66 adj=_zzi!=1;
67 if(_val<2&&zero_run<17+adj){
68 if(zero_run<6){
69 token=OC_DCT_RUN_CAT1A+zero_run-1;
70 eb=neg;
71 }
72 else if(zero_run<10){
73 token=OC_DCT_RUN_CAT1B;
74 eb=zero_run-6+(neg<<2);
75 }
76 else{
77 token=OC_DCT_RUN_CAT1C;
78 eb=zero_run-10+(neg<<3);
79 }
80 }
81 else if(_val<4&&zero_run<3+adj){
82 if(zero_run<2){
83 token=OC_DCT_RUN_CAT2A;
84 eb=_val-2+(neg<<1);
85 }
86 else{
87 token=OC_DCT_RUN_CAT2B;
88 eb=zero_run-2+(_val-2<<1)+(neg<<2);
89 }
90 }
91 else{
92 if(zero_run<9)token=OC_DCT_SHORT_ZRL_TOKEN;
93 else token=OC_DCT_ZRL_TOKEN;
94 eb=zero_run-1;
95 }
96 }
97 else if(_val<3){
98 token=OC_ONE_TOKEN+(_val-1<<1)+neg;
99 eb=0;
100 }
101 else if(_val<7){
102 token=OC_DCT_VAL_CAT2+_val-3;
103 eb=neg;
104 }
105 else if(_val<9){
106 token=OC_DCT_VAL_CAT3;
107 eb=_val-7+(neg<<1);
108 }
109 else if(_val<13){
110 token=OC_DCT_VAL_CAT4;
111 eb=_val-9+(neg<<2);
112 }
113 else if(_val<21){
114 token=OC_DCT_VAL_CAT5;
115 eb=_val-13+(neg<<3);
116 }
117 else if(_val<37){
118 token=OC_DCT_VAL_CAT6;
119 eb=_val-21+(neg<<4);
120 }
121 else if(_val<69){
122 token=OC_DCT_VAL_CAT7;
123 eb=_val-37+(neg<<5);
124 }
125 else{
126 token=OC_DCT_VAL_CAT8;
127 eb=_val-69+(neg<<9);
128 }
129 *_eb=eb;
130 return token;
131 }
132
133 /*Token logging to allow a few fragments of efficient rollback.
134 Late SKIP analysis is tied up in the tokenization process, so we need to be
135 able to undo a fragment's tokens on a whim.*/
136
137 static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
138 0,16,16,16,16,16,32,32,
139 32,32,32,32,32,32,32,48,
140 48,48,48,48,48,48,48,48,
141 48,48,48,48,64,64,64,64,
142 64,64,64,64,64,64,64,64,
143 64,64,64,64,64,64,64,64,
144 64,64,64,64,64,64,64,64
145 };
146
oc_token_bits(oc_enc_ctx * _enc,int _huffi,int _zzi,int _token)147 static int oc_token_bits(oc_enc_ctx *_enc,int _huffi,int _zzi,int _token){
148 return _enc->huff_codes[_huffi+OC_ZZI_HUFF_OFFSET[_zzi]][_token].nbits
149 +OC_DCT_TOKEN_EXTRA_BITS[_token];
150 }
151
oc_enc_tokenlog_checkpoint(oc_enc_ctx * _enc,oc_token_checkpoint * _cp,int _pli,int _zzi)152 static void oc_enc_tokenlog_checkpoint(oc_enc_ctx *_enc,
153 oc_token_checkpoint *_cp,int _pli,int _zzi){
154 _cp->pli=_pli;
155 _cp->zzi=_zzi;
156 _cp->eob_run=_enc->eob_run[_pli][_zzi];
157 _cp->ndct_tokens=_enc->ndct_tokens[_pli][_zzi];
158 }
159
oc_enc_tokenlog_rollback(oc_enc_ctx * _enc,const oc_token_checkpoint * _stack,int _n)160 void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
161 const oc_token_checkpoint *_stack,int _n){
162 int i;
163 for(i=_n;i-->0;){
164 int pli;
165 int zzi;
166 pli=_stack[i].pli;
167 zzi=_stack[i].zzi;
168 _enc->eob_run[pli][zzi]=_stack[i].eob_run;
169 _enc->ndct_tokens[pli][zzi]=_stack[i].ndct_tokens;
170 }
171 }
172
oc_enc_token_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _token,int _eb)173 static void oc_enc_token_log(oc_enc_ctx *_enc,
174 int _pli,int _zzi,int _token,int _eb){
175 ptrdiff_t ti;
176 ti=_enc->ndct_tokens[_pli][_zzi]++;
177 _enc->dct_tokens[_pli][_zzi][ti]=(unsigned char)_token;
178 _enc->extra_bits[_pli][_zzi][ti]=(ogg_uint16_t)_eb;
179 }
180
oc_enc_eob_log(oc_enc_ctx * _enc,int _pli,int _zzi,int _run_count)181 static void oc_enc_eob_log(oc_enc_ctx *_enc,
182 int _pli,int _zzi,int _run_count){
183 int token;
184 int eb;
185 token=oc_make_eob_token_full(_run_count,&eb);
186 oc_enc_token_log(_enc,_pli,_zzi,token,eb);
187 }
188
189
oc_enc_tokenize_start(oc_enc_ctx * _enc)190 void oc_enc_tokenize_start(oc_enc_ctx *_enc){
191 memset(_enc->ndct_tokens,0,sizeof(_enc->ndct_tokens));
192 memset(_enc->eob_run,0,sizeof(_enc->eob_run));
193 memset(_enc->dct_token_offs,0,sizeof(_enc->dct_token_offs));
194 memset(_enc->dc_pred_last,0,sizeof(_enc->dc_pred_last));
195 }
196
197 typedef struct oc_quant_token oc_quant_token;
198
199 /*A single node in the Viterbi trellis.
200 We maintain up to 2 of these per coefficient:
201 - A token to code if the value is zero (EOB, zero run, or combo token).
202 - A token to code if the value is not zero (DCT value token).*/
203 struct oc_quant_token{
204 unsigned char next;
205 signed char token;
206 ogg_int16_t eb;
207 ogg_uint32_t cost;
208 int bits;
209 int qc;
210 };
211
212 /*Tokenizes the AC coefficients, possibly adjusting the quantization, and then
213 dequantizes and de-zig-zags the result.
214 The DC coefficient is not preserved; it should be restored by the caller.*/
oc_enc_tokenize_ac(oc_enc_ctx * _enc,int _pli,ptrdiff_t _fragi,ogg_int16_t * _qdct,const ogg_uint16_t * _dequant,const ogg_int16_t * _dct,int _zzi,oc_token_checkpoint ** _stack,int _acmin)215 int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
216 ogg_int16_t *_qdct,const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
217 int _zzi,oc_token_checkpoint **_stack,int _acmin){
218 oc_token_checkpoint *stack;
219 ogg_int64_t zflags;
220 ogg_int64_t nzflags;
221 ogg_int64_t best_flags;
222 ogg_uint32_t d2_accum[64];
223 oc_quant_token tokens[64][2];
224 ogg_uint16_t *eob_run;
225 const unsigned char *dct_fzig_zag;
226 ogg_uint32_t cost;
227 int bits;
228 int eob;
229 int token;
230 int eb;
231 int next;
232 int huffi;
233 int zzi;
234 int ti;
235 int zzj;
236 int qc;
237 huffi=_enc->huff_idxs[_enc->state.frame_type][1][_pli+1>>1];
238 eob_run=_enc->eob_run[_pli];
239 memset(tokens[0],0,sizeof(tokens[0]));
240 best_flags=nzflags=0;
241 zflags=1;
242 d2_accum[0]=0;
243 zzj=64;
244 for(zzi=OC_MINI(_zzi,63);zzi>0;zzi--){
245 ogg_int32_t lambda;
246 ogg_uint32_t best_cost;
247 int best_bits=best_bits;
248 int best_next=best_next;
249 int best_token=best_token;
250 int best_eb=best_eb;
251 int best_qc=best_qc;
252 int flush_bits;
253 ogg_uint32_t d2;
254 int dq;
255 int e;
256 int c;
257 int s;
258 int tj;
259 lambda=_enc->lambda;
260 qc=_qdct[zzi];
261 s=-(qc<0);
262 qc=qc+s^s;
263 c=_dct[OC_FZIG_ZAG[zzi]];
264 if(qc<=1){
265 ogg_uint32_t sum_d2;
266 int nzeros;
267 int dc_reserve;
268 /*The hard case: try a zero run.*/
269 if(!qc){
270 /*Skip runs that are already quantized to zeros.
271 If we considered each zero coefficient in turn, we might
272 theoretically find a better way to partition long zero runs (e.g.,
273 a run of > 17 zeros followed by a 1 might be better coded as a short
274 zero run followed by a combo token, rather than the longer zero
275 token followed by a 1 value token), but zeros are so common that
276 this becomes very computationally expensive (quadratic instead of
277 linear in the number of coefficients), for a marginal gain.*/
278 while(zzi>1&&!_qdct[zzi-1])zzi--;
279 /*The distortion of coefficients originally quantized to zero is
280 treated as zero (since we'll never quantize them to anything else).*/
281 d2=0;
282 }
283 else{
284 c=c+s^s;
285 d2=c*(ogg_int32_t)c;
286 }
287 eob=eob_run[zzi];
288 nzeros=zzj-zzi;
289 zzj&=63;
290 sum_d2=d2+d2_accum[zzj];
291 d2_accum[zzi]=sum_d2;
292 flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
293 /*We reserve 1 spot for combo run tokens that start in the 1st AC stack
294 to ensure they can be extended to include the DC coefficient if
295 necessary; this greatly simplifies stack-rewriting later on.*/
296 dc_reserve=zzi+62>>6;
297 best_cost=0xFFFFFFFF;
298 for(;;){
299 if(nzflags>>zzj&1){
300 int cat;
301 int val;
302 int val_s;
303 int zzk;
304 int tk;
305 next=tokens[zzj][1].next;
306 tk=next&1;
307 zzk=next>>1;
308 /*Try a pure zero run to this point.*/
309 cat=nzeros+55>>6;
310 token=OC_DCT_SHORT_ZRL_TOKEN+cat;
311 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
312 d2=sum_d2-d2_accum[zzj];
313 cost=d2+lambda*bits+tokens[zzj][1].cost;
314 if(cost<=best_cost){
315 best_next=(zzj<<1)+1;
316 best_token=token;
317 best_eb=nzeros-1;
318 best_cost=cost;
319 best_bits=bits+tokens[zzj][1].bits;
320 best_qc=0;
321 }
322 if(nzeros<16+dc_reserve){
323 val=_qdct[zzj];
324 val_s=-(val<0);
325 val=val+val_s^val_s;
326 if(val<=2){
327 /*Try a +/- 1 combo token.*/
328 if(nzeros<6){
329 token=OC_DCT_RUN_CAT1A+nzeros-1;
330 eb=-val_s;
331 }
332 else{
333 cat=nzeros+54>>6;
334 token=OC_DCT_RUN_CAT1B+cat;
335 eb=(-val_s<<cat+2)+nzeros-6-(cat<<2);
336 }
337 e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj];
338 d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
339 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
340 cost=d2+lambda*bits+tokens[zzk][tk].cost;
341 if(cost<=best_cost){
342 best_next=next;
343 best_token=token;
344 best_eb=eb;
345 best_cost=cost;
346 best_bits=bits+tokens[zzk][tk].bits;
347 best_qc=1+val_s^val_s;
348 }
349 }
350 if(nzeros<2+dc_reserve&&2<=val&&val<=4){
351 /*Try a +/- 2/3 combo token.*/
352 cat=nzeros>>1;
353 token=OC_DCT_RUN_CAT2A+cat;
354 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
355 val=2+((val+val_s^val_s)>2);
356 e=(_dct[OC_FZIG_ZAG[zzj]]+val_s^val_s)-_dequant[zzj]*val;
357 d2=e*(ogg_int32_t)e+sum_d2-d2_accum[zzj];
358 cost=d2+lambda*bits+tokens[zzk][tk].cost;
359 if(cost<=best_cost){
360 best_cost=cost;
361 best_bits=bits+tokens[zzk][tk].bits;
362 best_next=next;
363 best_token=token;
364 best_eb=(-val_s<<1+cat)+(val-2<<cat)+(nzeros-1>>1);
365 best_qc=val+val_s^val_s;
366 }
367 }
368 }
369 /*zzj can't be coded as a zero, so stop trying to extend the run.*/
370 if(!(zflags>>zzj&1))break;
371 }
372 /*We could try to consider _all_ potentially non-zero coefficients, but
373 if we already found a bunch of them not worth coding, it's fairly
374 unlikely they would now be worth coding from this position; skipping
375 them saves a lot of work.*/
376 zzj=(tokens[zzj][0].next>>1)-(tokens[zzj][0].qc!=0)&63;
377 if(zzj==0){
378 /*We made it all the way to the end of the block; try an EOB token.*/
379 if(eob<4095){
380 bits=oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob+1))
381 -flush_bits;
382 }
383 else bits=oc_token_bits(_enc,huffi,zzi,OC_DCT_EOB1_TOKEN);
384 cost=sum_d2+bits*lambda;
385 /*If the best route so far is still a pure zero run to the end of the
386 block, force coding it as an EOB.
387 Even if it's not optimal for this block, it has a good chance of
388 getting combined with an EOB token from subsequent blocks, saving
389 bits overall.*/
390 if(cost<=best_cost||best_token<=OC_DCT_ZRL_TOKEN&&zzi+best_eb==63){
391 best_next=0;
392 /*This token is just a marker; in reality we may not emit any
393 tokens, but update eob_run[] instead.*/
394 best_token=OC_DCT_EOB1_TOKEN;
395 best_eb=0;
396 best_cost=cost;
397 best_bits=bits;
398 best_qc=0;
399 }
400 break;
401 }
402 nzeros=zzj-zzi;
403 }
404 tokens[zzi][0].next=(unsigned char)best_next;
405 tokens[zzi][0].token=(signed char)best_token;
406 tokens[zzi][0].eb=(ogg_int16_t)best_eb;
407 tokens[zzi][0].cost=best_cost;
408 tokens[zzi][0].bits=best_bits;
409 tokens[zzi][0].qc=best_qc;
410 zflags|=(ogg_int64_t)1<<zzi;
411 if(qc){
412 dq=_dequant[zzi];
413 if(zzi<_acmin)lambda=0;
414 e=dq-c;
415 d2=e*(ogg_int32_t)e;
416 token=OC_ONE_TOKEN-s;
417 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
418 zzj=zzi+1&63;
419 tj=best_flags>>zzj&1;
420 next=(zzj<<1)+tj;
421 tokens[zzi][1].next=(unsigned char)next;
422 tokens[zzi][1].token=(signed char)token;
423 tokens[zzi][1].eb=0;
424 tokens[zzi][1].cost=d2+lambda*bits+tokens[zzj][tj].cost;
425 tokens[zzi][1].bits=bits+tokens[zzj][tj].bits;
426 tokens[zzi][1].qc=1+s^s;
427 nzflags|=(ogg_int64_t)1<<zzi;
428 best_flags|=
429 (ogg_int64_t)(tokens[zzi][1].cost<tokens[zzi][0].cost)<<zzi;
430 }
431 }
432 else{
433 eob=eob_run[zzi];
434 if(zzi<_acmin)lambda=0;
435 c=c+s^s;
436 dq=_dequant[zzi];
437 /*No zero run can extend past this point.*/
438 d2_accum[zzi]=0;
439 flush_bits=eob>0?oc_token_bits(_enc,huffi,zzi,oc_make_eob_token(eob)):0;
440 if(qc<=2){
441 e=2*dq-c;
442 d2=e*(ogg_int32_t)e;
443 best_token=OC_TWO_TOKEN-s;
444 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
445 best_cost=d2+lambda*best_bits;
446 e-=dq;
447 d2=e*(ogg_int32_t)e;
448 token=OC_ONE_TOKEN-s;
449 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
450 cost=d2+lambda*bits;
451 if(cost<=best_cost){
452 best_token=token;
453 best_bits=bits;
454 best_cost=cost;
455 qc--;
456 }
457 best_eb=0;
458 }
459 else if(qc<=3){
460 e=3*dq-c;
461 d2=e*(ogg_int32_t)e;
462 best_token=OC_DCT_VAL_CAT2;
463 best_eb=-s;
464 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
465 best_cost=d2+lambda*best_bits;
466 e-=dq;
467 d2=e*(ogg_int32_t)e;
468 token=OC_TWO_TOKEN-s;
469 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
470 cost=d2+lambda*bits;
471 if(cost<=best_cost){
472 best_token=token;
473 best_eb=0;
474 best_bits=bits;
475 best_cost=cost;
476 qc--;
477 }
478 }
479 else if(qc<=6){
480 e=qc*dq-c;
481 d2=e*(ogg_int32_t)e;
482 best_token=OC_DCT_VAL_CAT2+qc-3;
483 best_eb=-s;
484 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
485 best_cost=d2+lambda*best_bits;
486 e-=dq;
487 d2=e*(ogg_int32_t)e;
488 token=best_token-1;
489 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
490 cost=d2+lambda*bits;
491 if(cost<=best_cost){
492 best_token=token;
493 best_bits=bits;
494 best_cost=cost;
495 qc--;
496 }
497 }
498 else if(qc<=8){
499 e=qc*dq-c;
500 d2=e*(ogg_int32_t)e;
501 best_token=OC_DCT_VAL_CAT3;
502 best_eb=(-s<<1)+qc-7;
503 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
504 best_cost=d2+lambda*best_bits;
505 e=6*dq-c;
506 d2=e*(ogg_int32_t)e;
507 token=OC_DCT_VAL_CAT2+3;
508 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
509 cost=d2+lambda*bits;
510 if(cost<=best_cost){
511 best_token=token;
512 best_eb=-s;
513 best_bits=bits;
514 best_cost=cost;
515 qc=6;
516 }
517 }
518 else if(qc<=12){
519 e=qc*dq-c;
520 d2=e*(ogg_int32_t)e;
521 best_token=OC_DCT_VAL_CAT4;
522 best_eb=(-s<<2)+qc-9;
523 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
524 best_cost=d2+lambda*best_bits;
525 e=8*dq-c;
526 d2=e*(ogg_int32_t)e;
527 token=best_token-1;
528 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
529 cost=d2+lambda*bits;
530 if(cost<=best_cost){
531 best_token=token;
532 best_eb=(-s<<1)+1;
533 best_bits=bits;
534 best_cost=cost;
535 qc=8;
536 }
537 }
538 else if(qc<=20){
539 e=qc*dq-c;
540 d2=e*(ogg_int32_t)e;
541 best_token=OC_DCT_VAL_CAT5;
542 best_eb=(-s<<3)+qc-13;
543 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
544 best_cost=d2+lambda*best_bits;
545 e=12*dq-c;
546 d2=e*(ogg_int32_t)e;
547 token=best_token-1;
548 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
549 cost=d2+lambda*bits;
550 if(cost<=best_cost){
551 best_token=token;
552 best_eb=(-s<<2)+3;
553 best_bits=bits;
554 best_cost=cost;
555 qc=12;
556 }
557 }
558 else if(qc<=36){
559 e=qc*dq-c;
560 d2=e*(ogg_int32_t)e;
561 best_token=OC_DCT_VAL_CAT6;
562 best_eb=(-s<<4)+qc-21;
563 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
564 best_cost=d2+lambda*best_bits;
565 e=20*dq-c;
566 d2=e*(ogg_int32_t)e;
567 token=best_token-1;
568 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
569 cost=d2+lambda*bits;
570 if(cost<=best_cost){
571 best_token=token;
572 best_eb=(-s<<3)+7;
573 best_bits=bits;
574 best_cost=cost;
575 qc=20;
576 }
577 }
578 else if(qc<=68){
579 e=qc*dq-c;
580 d2=e*(ogg_int32_t)e;
581 best_token=OC_DCT_VAL_CAT7;
582 best_eb=(-s<<5)+qc-37;
583 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
584 best_cost=d2+lambda*best_bits;
585 e=36*dq-c;
586 d2=e*(ogg_int32_t)e;
587 token=best_token-1;
588 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
589 cost=d2+lambda*bits;
590 if(cost<best_cost){
591 best_token=token;
592 best_eb=(-s<<4)+15;
593 best_bits=bits;
594 best_cost=cost;
595 qc=36;
596 }
597 }
598 else{
599 e=qc*dq-c;
600 d2=e*(ogg_int32_t)e;
601 best_token=OC_DCT_VAL_CAT8;
602 best_eb=(-s<<9)+qc-69;
603 best_bits=flush_bits+oc_token_bits(_enc,huffi,zzi,best_token);
604 best_cost=d2+lambda*best_bits;
605 e=68*dq-c;
606 d2=e*(ogg_int32_t)e;
607 token=best_token-1;
608 bits=flush_bits+oc_token_bits(_enc,huffi,zzi,token);
609 cost=d2+lambda*bits;
610 if(cost<best_cost){
611 best_token=token;
612 best_eb=(-s<<5)+31;
613 best_bits=bits;
614 best_cost=cost;
615 qc=68;
616 }
617 }
618 zzj=zzi+1&63;
619 tj=best_flags>>zzj&1;
620 next=(zzj<<1)+tj;
621 tokens[zzi][1].next=(unsigned char)next;
622 tokens[zzi][1].token=(signed char)best_token;
623 tokens[zzi][1].eb=best_eb;
624 tokens[zzi][1].cost=best_cost+tokens[zzj][tj].cost;
625 tokens[zzi][1].bits=best_bits+tokens[zzj][tj].bits;
626 tokens[zzi][1].qc=qc+s^s;
627 nzflags|=(ogg_int64_t)1<<zzi;
628 best_flags|=(ogg_int64_t)1<<zzi;
629 }
630 zzj=zzi;
631 }
632 /*Emit the tokens from the best path through the trellis.*/
633 stack=*_stack;
634 /*We blow away the first entry here so that things vectorize better.
635 The DC coefficient is not actually stored in the array yet.*/
636 for(zzi=0;zzi<64;zzi++)_qdct[zzi]=0;
637 dct_fzig_zag=_enc->state.opt_data.dct_fzig_zag;
638 zzi=1;
639 ti=best_flags>>1&1;
640 bits=tokens[zzi][ti].bits;
641 do{
642 oc_enc_tokenlog_checkpoint(_enc,stack++,_pli,zzi);
643 eob=eob_run[zzi];
644 if(tokens[zzi][ti].token<OC_NDCT_EOB_TOKEN_MAX){
645 if(++eob>=4095){
646 oc_enc_eob_log(_enc,_pli,zzi,eob);
647 eob=0;
648 }
649 eob_run[zzi]=eob;
650 /*We don't include the actual EOB cost for this block in the return value.
651 It will be paid for by the fragment that terminates the EOB run.*/
652 bits-=tokens[zzi][ti].bits;
653 zzi=_zzi;
654 break;
655 }
656 /*Emit pending EOB run if any.*/
657 if(eob>0){
658 oc_enc_eob_log(_enc,_pli,zzi,eob);
659 eob_run[zzi]=0;
660 }
661 oc_enc_token_log(_enc,_pli,zzi,tokens[zzi][ti].token,tokens[zzi][ti].eb);
662 next=tokens[zzi][ti].next;
663 qc=tokens[zzi][ti].qc;
664 zzj=(next>>1)-1&63;
665 /*TODO: It may be worth saving the dequantized coefficient in the trellis
666 above; we had to compute it to measure the error anyway.*/
667 _qdct[dct_fzig_zag[zzj]]=(ogg_int16_t)(qc*(int)_dequant[zzj]);
668 zzi=next>>1;
669 ti=next&1;
670 }
671 while(zzi);
672 *_stack=stack;
673 return bits;
674 }
675
oc_enc_pred_dc_frag_rows(oc_enc_ctx * _enc,int _pli,int _fragy0,int _frag_yend)676 void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
677 int _pli,int _fragy0,int _frag_yend){
678 const oc_fragment_plane *fplane;
679 const oc_fragment *frags;
680 ogg_int16_t *frag_dc;
681 ptrdiff_t fragi;
682 int *pred_last;
683 int nhfrags;
684 int fragx;
685 int fragy;
686 fplane=_enc->state.fplanes+_pli;
687 frags=_enc->state.frags;
688 frag_dc=_enc->frag_dc;
689 pred_last=_enc->dc_pred_last[_pli];
690 nhfrags=fplane->nhfrags;
691 fragi=fplane->froffset+_fragy0*nhfrags;
692 for(fragy=_fragy0;fragy<_frag_yend;fragy++){
693 if(fragy==0){
694 /*For the first row, all of the cases reduce to just using the previous
695 predictor for the same reference frame.*/
696 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
697 if(frags[fragi].coded){
698 int ref;
699 ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
700 frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred_last[ref]);
701 pred_last[ref]=frags[fragi].dc;
702 }
703 }
704 }
705 else{
706 const oc_fragment *u_frags;
707 int l_ref;
708 int ul_ref;
709 int u_ref;
710 u_frags=frags-nhfrags;
711 l_ref=-1;
712 ul_ref=-1;
713 u_ref=u_frags[fragi].coded?OC_FRAME_FOR_MODE(u_frags[fragi].mb_mode):-1;
714 for(fragx=0;fragx<nhfrags;fragx++,fragi++){
715 int ur_ref;
716 if(fragx+1>=nhfrags)ur_ref=-1;
717 else{
718 ur_ref=u_frags[fragi+1].coded?
719 OC_FRAME_FOR_MODE(u_frags[fragi+1].mb_mode):-1;
720 }
721 if(frags[fragi].coded){
722 int pred;
723 int ref;
724 ref=OC_FRAME_FOR_MODE(frags[fragi].mb_mode);
725 /*We break out a separate case based on which of our neighbors use
726 the same reference frames.
727 This is somewhat faster than trying to make a generic case which
728 handles all of them, since it reduces lots of poorly predicted
729 jumps to one switch statement, and also lets a number of the
730 multiplications be optimized out by strength reduction.*/
731 switch((l_ref==ref)|(ul_ref==ref)<<1|
732 (u_ref==ref)<<2|(ur_ref==ref)<<3){
733 default:pred=pred_last[ref];break;
734 case 1:
735 case 3:pred=frags[fragi-1].dc;break;
736 case 2:pred=u_frags[fragi-1].dc;break;
737 case 4:
738 case 6:
739 case 12:pred=u_frags[fragi].dc;break;
740 case 5:pred=(frags[fragi-1].dc+u_frags[fragi].dc)/2;break;
741 case 8:pred=u_frags[fragi+1].dc;break;
742 case 9:
743 case 11:
744 case 13:{
745 pred=(75*frags[fragi-1].dc+53*u_frags[fragi+1].dc)/128;
746 }break;
747 case 10:pred=(u_frags[fragi-1].dc+u_frags[fragi+1].dc)/2;break;
748 case 14:{
749 pred=(3*(u_frags[fragi-1].dc+u_frags[fragi+1].dc)
750 +10*u_frags[fragi].dc)/16;
751 }break;
752 case 7:
753 case 15:{
754 int p0;
755 int p1;
756 int p2;
757 p0=frags[fragi-1].dc;
758 p1=u_frags[fragi-1].dc;
759 p2=u_frags[fragi].dc;
760 pred=(29*(p0+p2)-26*p1)/32;
761 if(abs(pred-p2)>128)pred=p2;
762 else if(abs(pred-p0)>128)pred=p0;
763 else if(abs(pred-p1)>128)pred=p1;
764 }break;
765 }
766 frag_dc[fragi]=(ogg_int16_t)(frags[fragi].dc-pred);
767 pred_last[ref]=frags[fragi].dc;
768 l_ref=ref;
769 }
770 else l_ref=-1;
771 ul_ref=u_ref;
772 u_ref=ur_ref;
773 }
774 }
775 }
776 }
777
oc_enc_tokenize_dc_frag_list(oc_enc_ctx * _enc,int _pli,const ptrdiff_t * _coded_fragis,ptrdiff_t _ncoded_fragis,int _prev_ndct_tokens1,int _prev_eob_run1)778 void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
779 const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
780 int _prev_ndct_tokens1,int _prev_eob_run1){
781 const ogg_int16_t *frag_dc;
782 ptrdiff_t fragii;
783 unsigned char *dct_tokens0;
784 unsigned char *dct_tokens1;
785 ogg_uint16_t *extra_bits0;
786 ogg_uint16_t *extra_bits1;
787 ptrdiff_t ti0;
788 ptrdiff_t ti1r;
789 ptrdiff_t ti1w;
790 int eob_run0;
791 int eob_run1;
792 int neobs1;
793 int token;
794 int eb;
795 int token1=token1;
796 int eb1=eb1;
797 /*Return immediately if there are no coded fragments; otherwise we'd flush
798 any trailing EOB run into the AC 1 list and never read it back out.*/
799 if(_ncoded_fragis<=0)return;
800 frag_dc=_enc->frag_dc;
801 dct_tokens0=_enc->dct_tokens[_pli][0];
802 dct_tokens1=_enc->dct_tokens[_pli][1];
803 extra_bits0=_enc->extra_bits[_pli][0];
804 extra_bits1=_enc->extra_bits[_pli][1];
805 ti0=_enc->ndct_tokens[_pli][0];
806 ti1w=ti1r=_prev_ndct_tokens1;
807 eob_run0=_enc->eob_run[_pli][0];
808 /*Flush any trailing EOB run for the 1st AC coefficient.
809 This is needed to allow us to track tokens to the end of the list.*/
810 eob_run1=_enc->eob_run[_pli][1];
811 if(eob_run1>0)oc_enc_eob_log(_enc,_pli,1,eob_run1);
812 /*If there was an active EOB run at the start of the 1st AC stack, read it
813 in and decode it.*/
814 if(_prev_eob_run1>0){
815 token1=dct_tokens1[ti1r];
816 eb1=extra_bits1[ti1r];
817 ti1r++;
818 eob_run1=oc_decode_eob_token(token1,eb1);
819 /*Consume the portion of the run that came before these fragments.*/
820 neobs1=eob_run1-_prev_eob_run1;
821 }
822 else eob_run1=neobs1=0;
823 for(fragii=0;fragii<_ncoded_fragis;fragii++){
824 int val;
825 /*All tokens in the 1st AC coefficient stack are regenerated as the DC
826 coefficients are produced.
827 This can be done in-place; stack 1 cannot get larger.*/
828 if(!neobs1){
829 /*There's no active EOB run in stack 1; read the next token.*/
830 token1=dct_tokens1[ti1r];
831 eb1=extra_bits1[ti1r];
832 ti1r++;
833 if(token1<OC_NDCT_EOB_TOKEN_MAX){
834 neobs1=oc_decode_eob_token(token1,eb1);
835 /*It's an EOB run; add it to the current (inactive) one.
836 Because we may have moved entries to stack 0, we may have an
837 opportunity to merge two EOB runs in stack 1.*/
838 eob_run1+=neobs1;
839 }
840 }
841 val=frag_dc[_coded_fragis[fragii]];
842 if(val){
843 /*There was a non-zero DC value, so there's no alteration to stack 1
844 for this fragment; just code the stack 0 token.*/
845 /*Flush any pending EOB run.*/
846 if(eob_run0>0){
847 token=oc_make_eob_token_full(eob_run0,&eb);
848 dct_tokens0[ti0]=(unsigned char)token;
849 extra_bits0[ti0]=(ogg_uint16_t)eb;
850 ti0++;
851 eob_run0=0;
852 }
853 token=oc_make_dct_token_full(0,0,val,&eb);
854 dct_tokens0[ti0]=(unsigned char)token;
855 extra_bits0[ti0]=(ogg_uint16_t)eb;
856 ti0++;
857 }
858 else{
859 /*Zero DC value; that means the entry in stack 1 might need to be coded
860 from stack 0.
861 This requires a stack 1 fixup.*/
862 if(neobs1>0){
863 /*We're in the middle of an active EOB run in stack 1.
864 Move it to stack 0.*/
865 if(++eob_run0>=4095){
866 token=oc_make_eob_token_full(eob_run0,&eb);
867 dct_tokens0[ti0]=(unsigned char)token;
868 extra_bits0[ti0]=(ogg_uint16_t)eb;
869 ti0++;
870 eob_run0=0;
871 }
872 eob_run1--;
873 }
874 else{
875 /*No active EOB run in stack 1, so we can't extend one in stack 0.
876 Flush it if we've got it.*/
877 if(eob_run0>0){
878 token=oc_make_eob_token_full(eob_run0,&eb);
879 dct_tokens0[ti0]=(unsigned char)token;
880 extra_bits0[ti0]=(ogg_uint16_t)eb;
881 ti0++;
882 eob_run0=0;
883 }
884 /*Stack 1 token is one of: a pure zero run token, a single
885 coefficient token, or a zero run/coefficient combo token.
886 A zero run token is expanded and moved to token stack 0, and the
887 stack 1 entry dropped.
888 A single coefficient value may be transformed into combo token that
889 is moved to stack 0, or if it cannot be combined, it is left alone
890 and a single length-1 zero run is emitted in stack 0.
891 A combo token is extended and moved to stack 0.
892 During AC coding, we restrict the run lengths on combo tokens for
893 stack 1 to guarantee we can extend them.*/
894 switch(token1){
895 case OC_DCT_SHORT_ZRL_TOKEN:{
896 if(eb1<7){
897 dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
898 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
899 ti0++;
900 /*Don't write the AC coefficient back out.*/
901 continue;
902 }
903 /*Fall through.*/
904 }
905 case OC_DCT_ZRL_TOKEN:{
906 dct_tokens0[ti0]=OC_DCT_ZRL_TOKEN;
907 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
908 ti0++;
909 /*Don't write the AC coefficient back out.*/
910 }continue;
911 case OC_ONE_TOKEN:
912 case OC_MINUS_ONE_TOKEN:{
913 dct_tokens0[ti0]=OC_DCT_RUN_CAT1A;
914 extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_ONE_TOKEN);
915 ti0++;
916 /*Don't write the AC coefficient back out.*/
917 }continue;
918 case OC_TWO_TOKEN:
919 case OC_MINUS_TWO_TOKEN:{
920 dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
921 extra_bits0[ti0]=(ogg_uint16_t)(token1-OC_TWO_TOKEN<<1);
922 ti0++;
923 /*Don't write the AC coefficient back out.*/
924 }continue;
925 case OC_DCT_VAL_CAT2:{
926 dct_tokens0[ti0]=OC_DCT_RUN_CAT2A;
927 extra_bits0[ti0]=(ogg_uint16_t)((eb1<<1)+1);
928 ti0++;
929 /*Don't write the AC coefficient back out.*/
930 }continue;
931 case OC_DCT_RUN_CAT1A:
932 case OC_DCT_RUN_CAT1A+1:
933 case OC_DCT_RUN_CAT1A+2:
934 case OC_DCT_RUN_CAT1A+3:{
935 dct_tokens0[ti0]=(unsigned char)(token1+1);
936 extra_bits0[ti0]=(ogg_uint16_t)eb1;
937 ti0++;
938 /*Don't write the AC coefficient back out.*/
939 }continue;
940 case OC_DCT_RUN_CAT1A+4:{
941 dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
942 extra_bits0[ti0]=(ogg_uint16_t)(eb1<<2);
943 ti0++;
944 /*Don't write the AC coefficient back out.*/
945 }continue;
946 case OC_DCT_RUN_CAT1B:{
947 if((eb1&3)<3){
948 dct_tokens0[ti0]=OC_DCT_RUN_CAT1B;
949 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
950 ti0++;
951 /*Don't write the AC coefficient back out.*/
952 continue;
953 }
954 eb1=((eb1&4)<<1)-1;
955 /*Fall through.*/
956 }
957 case OC_DCT_RUN_CAT1C:{
958 dct_tokens0[ti0]=OC_DCT_RUN_CAT1C;
959 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
960 ti0++;
961 /*Don't write the AC coefficient back out.*/
962 }continue;
963 case OC_DCT_RUN_CAT2A:{
964 eb1=(eb1<<1)-1;
965 /*Fall through.*/
966 }
967 case OC_DCT_RUN_CAT2B:{
968 dct_tokens0[ti0]=OC_DCT_RUN_CAT2B;
969 extra_bits0[ti0]=(ogg_uint16_t)(eb1+1);
970 ti0++;
971 /*Don't write the AC coefficient back out.*/
972 }continue;
973 }
974 /*We can't merge tokens, write a short zero run and keep going.*/
975 dct_tokens0[ti0]=OC_DCT_SHORT_ZRL_TOKEN;
976 extra_bits0[ti0]=0;
977 ti0++;
978 }
979 }
980 if(!neobs1){
981 /*Flush any (inactive) EOB run.*/
982 if(eob_run1>0){
983 token=oc_make_eob_token_full(eob_run1,&eb);
984 dct_tokens1[ti1w]=(unsigned char)token;
985 extra_bits1[ti1w]=(ogg_uint16_t)eb;
986 ti1w++;
987 eob_run1=0;
988 }
989 /*There's no active EOB run, so log the current token.*/
990 dct_tokens1[ti1w]=(unsigned char)token1;
991 extra_bits1[ti1w]=(ogg_uint16_t)eb1;
992 ti1w++;
993 }
994 else{
995 /*Otherwise consume one EOB from the current run.*/
996 neobs1--;
997 /*If we have more than 4095 EOBs outstanding in stack1, flush the run.*/
998 if(eob_run1-neobs1>=4095){
999 token=oc_make_eob_token_full(4095,&eb);
1000 dct_tokens1[ti1w]=(unsigned char)token;
1001 extra_bits1[ti1w]=(ogg_uint16_t)eb;
1002 ti1w++;
1003 eob_run1-=4095;
1004 }
1005 }
1006 }
1007 /*Save the current state.*/
1008 _enc->ndct_tokens[_pli][0]=ti0;
1009 _enc->ndct_tokens[_pli][1]=ti1w;
1010 _enc->eob_run[_pli][0]=eob_run0;
1011 _enc->eob_run[_pli][1]=eob_run1;
1012 }
1013
1014 /*Final EOB run welding.*/
oc_enc_tokenize_finish(oc_enc_ctx * _enc)1015 void oc_enc_tokenize_finish(oc_enc_ctx *_enc){
1016 int pli;
1017 int zzi;
1018 /*Emit final EOB runs.*/
1019 for(pli=0;pli<3;pli++)for(zzi=0;zzi<64;zzi++){
1020 int eob_run;
1021 eob_run=_enc->eob_run[pli][zzi];
1022 if(eob_run>0)oc_enc_eob_log(_enc,pli,zzi,eob_run);
1023 }
1024 /*Merge the final EOB run of one token list with the start of the next, if
1025 possible.*/
1026 for(zzi=0;zzi<64;zzi++)for(pli=0;pli<3;pli++){
1027 int old_tok1;
1028 int old_tok2;
1029 int old_eb1;
1030 int old_eb2;
1031 int new_tok;
1032 int new_eb;
1033 int zzj;
1034 int plj;
1035 ptrdiff_t ti=ti;
1036 int run_count;
1037 /*Make sure this coefficient has tokens at all.*/
1038 if(_enc->ndct_tokens[pli][zzi]<=0)continue;
1039 /*Ensure the first token is an EOB run.*/
1040 old_tok2=_enc->dct_tokens[pli][zzi][0];
1041 if(old_tok2>=OC_NDCT_EOB_TOKEN_MAX)continue;
1042 /*Search for a previous coefficient that has any tokens at all.*/
1043 old_tok1=OC_NDCT_EOB_TOKEN_MAX;
1044 for(zzj=zzi,plj=pli;zzj>=0;zzj--){
1045 while(plj-->0){
1046 ti=_enc->ndct_tokens[plj][zzj]-1;
1047 if(ti>=_enc->dct_token_offs[plj][zzj]){
1048 old_tok1=_enc->dct_tokens[plj][zzj][ti];
1049 break;
1050 }
1051 }
1052 if(plj>=0)break;
1053 plj=3;
1054 }
1055 /*Ensure its last token was an EOB run.*/
1056 if(old_tok1>=OC_NDCT_EOB_TOKEN_MAX)continue;
1057 /*Pull off the associated extra bits, if any, and decode the runs.*/
1058 old_eb1=_enc->extra_bits[plj][zzj][ti];
1059 old_eb2=_enc->extra_bits[pli][zzi][0];
1060 run_count=oc_decode_eob_token(old_tok1,old_eb1)
1061 +oc_decode_eob_token(old_tok2,old_eb2);
1062 /*We can't possibly combine these into one run.
1063 It might be possible to split them more optimally, but we'll just leave
1064 them as-is.*/
1065 if(run_count>=4096)continue;
1066 /*We CAN combine them into one run.*/
1067 new_tok=oc_make_eob_token_full(run_count,&new_eb);
1068 _enc->dct_tokens[plj][zzj][ti]=(unsigned char)new_tok;
1069 _enc->extra_bits[plj][zzj][ti]=(ogg_uint16_t)new_eb;
1070 _enc->dct_token_offs[pli][zzi]++;
1071 }
1072 }
1073