1 /********************************************************************
2 * *
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
7 * *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
10 * *
11 ********************************************************************
12
13 function: mode selection code
14 last mod: $Id$
15
16 ********************************************************************/
17 #include <limits.h>
18 #include <string.h>
19 #include "encint.h"
20 #include "modedec.h"
21
22
23
24 typedef struct oc_fr_state oc_fr_state;
25 typedef struct oc_qii_state oc_qii_state;
26 typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
27 typedef struct oc_rd_metric oc_rd_metric;
28 typedef struct oc_mode_choice oc_mode_choice;
29
30
31
32 /*There are 8 possible schemes used to encode macro block modes.
33 Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
34 The same set of Huffman codes is used for each of these 7 schemes, but the
35 mode assigned to each codeword varies.
36 Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
37 while schemes 1-6 have a fixed mapping.
38 Scheme 7 just encodes each mode directly in 3 bits.*/
39
/*The mode orderings for the various mode coding schemes.
  Scheme 0 uses a custom alphabet, which is not stored in this table.
  This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
   decoder.
  Key to the column labels (taken from the identity mapping of the default
   ordering, where mode si has rank si): N=INTER_NOMV, I=INTRA, M=INTER_MV,
   L=INTER_MV_LAST, P=INTER_MV_LAST2, G=GOLDEN_NOMV, GM=GOLDEN_MV,
   4=INTER_MV_FOUR -- confirm against the OC_MODE_* constants in the shared
   headers.*/
static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
  /*Last MV dominates.*/
  /*L P M N I G GM 4*/
  {3,4,2,0,1,5,6,7},
  /*L P N M I G GM 4*/
  {2,4,3,0,1,5,6,7},
  /*L M P N I G GM 4*/
  {3,4,1,0,2,5,6,7},
  /*L M N P I G GM 4*/
  {2,4,1,0,3,5,6,7},
  /*No MV dominates.*/
  /*N L P M I G GM 4*/
  {0,4,3,1,2,5,6,7},
  /*N G L P M I GM 4*/
  {0,5,4,2,3,1,6,7},
  /*Default ordering.*/
  /*N I M L P G GM 4*/
  {0,1,2,3,4,5,6,7}
};
63
64
65
66 /*Initialize the mode scheme chooser.
67 This need only be called once per encoder.*/
oc_mode_scheme_chooser_init(oc_mode_scheme_chooser * _chooser)68 void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
69 int si;
70 _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
71 for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
72 }
73
74 /*Reset the mode scheme chooser.
75 This needs to be called once for each frame, including the first.*/
oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser * _chooser)76 static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
77 int si;
78 memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
79 /*Scheme 0 starts with 24 bits to store the mode list in.*/
80 _chooser->scheme_bits[0]=24;
81 memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
82 for(si=0;si<8;si++){
83 /*Scheme 7 should always start first, and scheme 0 should always start
84 last.*/
85 _chooser->scheme_list[si]=7-si;
86 _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
87 }
88 }
89
90
91 /*This is the real purpose of this data structure: not actually selecting a
92 mode scheme, but estimating the cost of coding a given mode given all the
93 modes selected so far.
94 This is done via opportunity cost: the cost is defined as the number of bits
95 required to encode all the modes selected so far including the current one
96 using the best possible scheme, minus the number of bits required to encode
97 all the modes selected so far not including the current one using the best
98 possible scheme.
99 The computational expense of doing this probably makes it overkill.
100 Just be happy we take a greedy approach instead of trying to solve the
101 global mode-selection problem (which is NP-hard).
102 _mb_mode: The mode to determine the cost of.
103 Return: The number of bits required to code this mode.*/
static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
 int _mb_mode){
  int scheme0;
  int scheme1;
  int best_bits;
  int mode_bits;
  int si;
  int scheme_bits;
  /*scheme_list is kept sorted by total bit cost (see
     oc_mode_scheme_chooser_update), so entry 0 is the current best scheme and
     entry 1 the runner-up.*/
  scheme0=_chooser->scheme_list[0];
  scheme1=_chooser->scheme_list[1];
  best_bits=_chooser->scheme_bits[scheme0];
  /*scheme0+1>>3 is 1 only for scheme 7 (fixed 3-bit codes); all the
     Huffman-coded schemes share row 0 of OC_MODE_BITS.*/
  mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]];
  /*Typical case: If the difference between the best scheme and the next best
     is greater than 6 bits, then adding just one mode cannot change which
     scheme we use.*/
  if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
  /*Otherwise, check to see if adding this mode selects a different scheme as
     the best.*/
  si=1;
  best_bits+=mode_bits;
  do{
    /*For any scheme except 0, we can just use the bit cost of the mode's rank
       in that scheme.*/
    if(scheme1!=0){
      scheme_bits=_chooser->scheme_bits[scheme1]+
       OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]];
    }
    else{
      int ri;
      /*For scheme 0, incrementing the mode count could potentially change the
         mode's rank.
        Find the index where the mode would be moved to in the optimal list,
         and use its bit cost instead of the one for the mode's current
         position in the list.*/
      /*We don't recompute scheme bits; this is computing opportunity cost, not
         an update.*/
      for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&&
       _chooser->mode_counts[_mb_mode]>=
       _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
      scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri];
    }
    if(scheme_bits<best_bits)best_bits=scheme_bits;
    if(++si>=8)break;
    scheme1=_chooser->scheme_list[si];
  }
  while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
  /*Opportunity cost: bits with this mode added, minus bits without it.*/
  return best_bits-_chooser->scheme_bits[scheme0];
}
152
/*Incrementally update the mode counts and per-scheme bit counts and re-order
   the scheme lists once a mode has been selected.
  _mb_mode: The mode that was chosen.*/
static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
 int _mb_mode){
  int ri;
  int si;
  _chooser->mode_counts[_mb_mode]++;
  /*Re-order the scheme0 mode list if necessary.
    The chosen mode bubbles up past any mode with a strictly smaller count.*/
  for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
    int pmode;
    pmode=_chooser->scheme0_list[ri-1];
    if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
    /*Reorder the mode ranking.*/
    _chooser->scheme0_ranks[pmode]++;
    _chooser->scheme0_list[ri]=pmode;
  }
  _chooser->scheme0_ranks[_mb_mode]=ri;
  _chooser->scheme0_list[ri]=_mb_mode;
  /*Now add the bit cost for the mode to each scheme.
    si+1>>3 is 1 only for scheme 7 (fixed 3-bit codes); the other schemes use
     the shared Huffman code lengths in row 0.*/
  for(si=0;si<8;si++){
    _chooser->scheme_bits[si]+=
     OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
  }
  /*Finally, re-order the list of schemes: insertion sort keeping the cheapest
     scheme at index 0.*/
  for(si=1;si<8;si++){
    int sj;
    int scheme0;
    int bits0;
    sj=si;
    scheme0=_chooser->scheme_list[si];
    bits0=_chooser->scheme_bits[scheme0];
    do{
      int scheme1;
      scheme1=_chooser->scheme_list[sj-1];
      if(bits0>=_chooser->scheme_bits[scheme1])break;
      _chooser->scheme_list[sj]=scheme1;
    }
    while(--sj>0);
    _chooser->scheme_list[sj]=scheme0;
  }
}
195
196
197
198 /*The number of bits required to encode a super block run.
199 _run_count: The desired run count; must be positive and less than 4130.*/
oc_sb_run_bits(int _run_count)200 static int oc_sb_run_bits(int _run_count){
201 int i;
202 for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
203 return OC_SB_RUN_CODE_NBITS[i];
204 }
205
206 /*The number of bits required to encode a block run.
207 _run_count: The desired run count; must be positive and less than 30.*/
oc_block_run_bits(int _run_count)208 static int oc_block_run_bits(int _run_count){
209 return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
210 }
211
212
213
/*State to track coded block flags and their bit cost.*/
struct oc_fr_state{
  /*The number of bits required for the coded block flags so far.*/
  ptrdiff_t  bits;
  /*The length of the current sb_partial flag run.*/
  unsigned   sb_partial_count:16;
  /*The length of the current sb_full flag run.*/
  unsigned   sb_full_count:16;
  /*The coded block flag run length as of the last super block boundary,
     used to roll back partial runs when a super block turns out to be
     fully coded/uncoded.*/
  unsigned   b_coded_count_prev:8;
  /*The length of the current coded block flag run.*/
  unsigned   b_coded_count:8;
  /*The number of blocks processed so far in the current super block.*/
  unsigned   b_count:8;
  /*The value of the last "super block partially coded" flag (-1: none yet).*/
  signed int sb_partial:2;
  /*The value of the last "super block fully coded" flag (-1: none yet).*/
  signed int sb_full:2;
  /*The coded block flag value as of the last super block boundary.*/
  signed int b_coded_prev:2;
  /*The value of the last coded block flag (-1: none yet).*/
  signed int b_coded:2;
};
227
228
229
/*Reset a flag-tracking state to the start-of-frame condition.*/
static void oc_fr_state_init(oc_fr_state *_fr){
  /*No bits charged yet and no runs in progress.*/
  _fr->bits=0;
  _fr->sb_partial_count=0;
  _fr->sb_full_count=0;
  _fr->b_coded_count_prev=0;
  _fr->b_coded_count=0;
  _fr->b_count=0;
  /*-1 can never match a real flag value, so the first flag of each kind
     always starts a fresh run.*/
  _fr->sb_partial=-1;
  _fr->sb_full=-1;
  _fr->b_coded=-1;
  _fr->b_coded_prev=-1;
}
242
243
/*Charge the bit cost of one more super block's partial/full flags to _fr,
   extending the current runs or starting new ones as needed.
  _sb_partial: Whether this super block is partially coded.
  _sb_full:    Whether this super block is fully coded; only examined when
                the super block is not partially coded.*/
static void oc_fr_state_advance_sb(oc_fr_state *_fr,
 int _sb_partial,int _sb_full){
  ptrdiff_t bits;
  int       sb_partial_count;
  int       sb_full_count;
  bits=_fr->bits;
  /*Extend the sb_partial run, or start a new one.*/
  /*Bug fix: this previously read _fr->sb_partial (the 2-bit flag value,
     -1..1) instead of the run length, corrupting the run accounting.*/
  sb_partial_count=_fr->sb_partial_count;
  if(_fr->sb_partial==_sb_partial){
    if(sb_partial_count>=4129){
      /*Runs longer than 4129 cannot be coded; a new run must be started,
         costing one extra flag bit (see oc_sb_run_bits's limit).*/
      bits++;
      sb_partial_count=0;
    }
    /*Remove the cost of the old, shorter run; the new length is re-added
       below.*/
    else bits-=oc_sb_run_bits(sb_partial_count);
  }
  else sb_partial_count=0;
  sb_partial_count++;
  bits+=oc_sb_run_bits(sb_partial_count);
  if(!_sb_partial){
    /*Extend the sb_full run, or start a new one.*/
    sb_full_count=_fr->sb_full_count;
    if(_fr->sb_full==_sb_full){
      if(sb_full_count>=4129){
        bits++;
        sb_full_count=0;
      }
      else bits-=oc_sb_run_bits(sb_full_count);
    }
    else sb_full_count=0;
    sb_full_count++;
    bits+=oc_sb_run_bits(sb_full_count);
    _fr->sb_full=_sb_full;
    _fr->sb_full_count=sb_full_count;
  }
  _fr->bits=bits;
  _fr->sb_partial=_sb_partial;
  _fr->sb_partial_count=sb_partial_count;
}
282
283 /*Flush any outstanding block flags for a SB (e.g., one with fewer than 16
284 blocks).*/
oc_fr_state_flush_sb(oc_fr_state * _fr)285 static void oc_fr_state_flush_sb(oc_fr_state *_fr){
286 ptrdiff_t bits;
287 int sb_partial;
288 int sb_full=sb_full;
289 int b_coded_count;
290 int b_coded;
291 int b_count;
292 b_count=_fr->b_count;
293 if(b_count>0){
294 bits=_fr->bits;
295 b_coded=_fr->b_coded;
296 b_coded_count=_fr->b_coded_count;
297 if(b_coded_count>=b_count){
298 /*This SB was fully coded/uncoded; roll back the partial block flags.*/
299 bits-=oc_block_run_bits(b_coded_count);
300 if(b_coded_count>b_count)bits+=oc_block_run_bits(b_coded_count-b_count);
301 sb_partial=0;
302 sb_full=b_coded;
303 b_coded=_fr->b_coded_prev;
304 b_coded_count=_fr->b_coded_count_prev;
305 }
306 else{
307 /*It was partially coded.*/
308 sb_partial=1;
309 /*sb_full is unused.*/
310 }
311 _fr->bits=bits;
312 _fr->b_coded_count=b_coded_count;
313 _fr->b_coded_count_prev=b_coded_count;
314 _fr->b_count=0;
315 _fr->b_coded=b_coded;
316 _fr->b_coded_prev=b_coded;
317 oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
318 }
319 }
320
/*Charge the bit cost of one more coded block flag to _fr, finishing the
   super block bookkeeping automatically when 16 blocks have accumulated.
  _b_coded: Whether the block is coded.*/
static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
  ptrdiff_t bits;
  int       b_coded_count;
  int       b_count;
  int       sb_partial;
  /*sb_full is only meaningful in the fully coded/uncoded branch below;
     initialize it to avoid reading an indeterminate value (the previous
     "sb_full=sb_full" self-initialization was undefined behavior).*/
  int       sb_full=0;
  bits=_fr->bits;
  /*Extend the b_coded run, or start a new one.*/
  b_coded_count=_fr->b_coded_count;
  if(_fr->b_coded==_b_coded)bits-=oc_block_run_bits(b_coded_count);
  else b_coded_count=0;
  b_coded_count++;
  b_count=_fr->b_count+1;
  if(b_count>=16){
    /*We finished a superblock.*/
    if(b_coded_count>=16){
      /*It was fully coded/uncoded; roll back the partial block flags.*/
      if(b_coded_count>16)bits+=oc_block_run_bits(b_coded_count-16);
      sb_partial=0;
      sb_full=_b_coded;
      _b_coded=_fr->b_coded_prev;
      b_coded_count=_fr->b_coded_count_prev;
    }
    else{
      bits+=oc_block_run_bits(b_coded_count);
      /*It was partially coded.*/
      sb_partial=1;
      /*sb_full is unused.*/
    }
    _fr->bits=bits;
    _fr->b_coded_count=b_coded_count;
    _fr->b_coded_count_prev=b_coded_count;
    _fr->b_count=0;
    _fr->b_coded=_b_coded;
    _fr->b_coded_prev=_b_coded;
    oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
  }
  else{
    bits+=oc_block_run_bits(b_coded_count);
    _fr->bits=bits;
    _fr->b_coded_count=b_coded_count;
    _fr->b_count=b_count;
    _fr->b_coded=_b_coded;
  }
}
366
/*Advance the flag state as if the next block were skipped (not coded).*/
static void oc_fr_skip_block(oc_fr_state *_fr){
  oc_fr_state_advance_block(_fr,0);
}
370
/*Advance the flag state as if the next block were coded.*/
static void oc_fr_code_block(oc_fr_state *_fr){
  oc_fr_state_advance_block(_fr,1);
}
374
oc_fr_cost1(const oc_fr_state * _fr)375 static int oc_fr_cost1(const oc_fr_state *_fr){
376 oc_fr_state tmp;
377 ptrdiff_t bits;
378 *&tmp=*_fr;
379 oc_fr_skip_block(&tmp);
380 bits=tmp.bits;
381 *&tmp=*_fr;
382 oc_fr_code_block(&tmp);
383 return (int)(tmp.bits-bits);
384 }
385
oc_fr_cost4(const oc_fr_state * _pre,const oc_fr_state * _post)386 static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
387 oc_fr_state tmp;
388 *&tmp=*_pre;
389 oc_fr_skip_block(&tmp);
390 oc_fr_skip_block(&tmp);
391 oc_fr_skip_block(&tmp);
392 oc_fr_skip_block(&tmp);
393 return (int)(_post->bits-tmp.bits);
394 }
395
396
397
/*State to track the qi-index flag runs and their bit cost.
  Each coded block's qi index is signaled with two flag runs: one
   distinguishing qi 0 from qi 1/2, and a second distinguishing qi 1 from
   qi 2 (see oc_qii_state_advance).*/
struct oc_qii_state{
  /*The number of bits charged to qi flags so far.*/
  ptrdiff_t         bits;
  /*The length of the current run of the "qi is nonzero" flag.*/
  unsigned          qi01_count:14;
  /*The value of the last "qi is nonzero" flag (-1: no run started yet).*/
  signed int        qi01:2;
  /*The length of the current run of the "qi is 2" flag.*/
  unsigned          qi12_count:14;
  /*The value of the last "qi is 2" flag (-1: no run started yet).*/
  signed int        qi12:2;
};
405
406
407
/*Reset a qi-flag tracking state to the start-of-frame condition.*/
static void oc_qii_state_init(oc_qii_state *_qs){
  /*-1 never matches a real flag value, so the first flag of each kind starts
     a fresh run.*/
  _qs->bits=0;
  _qs->qi01=-1;
  _qs->qi01_count=0;
  _qs->qi12=-1;
  _qs->qi12_count=0;
}
415
416
/*Charge the cost of coding one more block's qi index, writing the updated
   run state to _qd.
  _qd and _qs may alias (they do at the call site in this file).
  _qii: The qi index chosen for the block (presumably 0..2, since there are
         at most three qis -- see the nqis loop in oc_enc_pipeline_init).*/
static void oc_qii_state_advance(oc_qii_state *_qd,
 const oc_qii_state *_qs,int _qii){
  ptrdiff_t bits;
  int       qi01;
  int       qi01_count;
  int       qi12;
  int       qi12_count;
  bits=_qs->bits;
  /*First flag: 0 for qi 0, 1 for qi 1 or 2.*/
  qi01=_qii+1>>1;
  qi01_count=_qs->qi01_count;
  if(qi01==_qs->qi01){
    if(qi01_count>=4129){
      /*The run-length limit was reached; start a new run at one extra bit.*/
      bits++;
      qi01_count=0;
    }
    else bits-=oc_sb_run_bits(qi01_count);
  }
  else qi01_count=0;
  qi01_count++;
  bits+=oc_sb_run_bits(qi01_count);
  qi12_count=_qs->qi12_count;
  if(_qii){
    /*Second flag, only present for nonzero qi: 0 for qi 1, 1 for qi 2.*/
    qi12=_qii>>1;
    if(qi12==_qs->qi12){
      if(qi12_count>=4129){
        bits++;
        qi12_count=0;
      }
      else bits-=oc_sb_run_bits(qi12_count);
    }
    else qi12_count=0;
    qi12_count++;
    bits+=oc_sb_run_bits(qi12_count);
  }
  /*qi 0 does not touch the second flag's run.*/
  else qi12=_qs->qi12;
  _qd->bits=bits;
  _qd->qi01=qi01;
  _qd->qi01_count=qi01_count;
  _qd->qi12=qi12;
  _qd->qi12_count=qi12_count;
}
458
459
460
/*Temporary encoder state for the analysis pipeline.*/
struct oc_enc_pipeline_state{
  /*Bounding value array used by the loop filter.*/
  int                 bounding_values[256];
  /*Coded block flag bit-cost trackers, one per color plane.*/
  oc_fr_state         fr[3];
  /*qi-index flag bit-cost trackers, one per color plane.*/
  oc_qii_state        qs[3];
  /*Condensed dequantization tables.*/
  const ogg_uint16_t *dequant[3][3][2];
  /*Condensed quantization tables.*/
  const oc_iquant    *enquant[3][3][2];
  /*Skip SSD storage for the current MCU in each plane.*/
  unsigned           *skip_ssd[3];
  /*Coded/uncoded fragment lists for each plane for the current MCU.
    The uncoded list grows downward from the end of its buffer (see
     oc_enc_pipeline_finish_mcu_plane).*/
  ptrdiff_t          *coded_fragis[3];
  ptrdiff_t          *uncoded_fragis[3];
  ptrdiff_t           ncoded_fragis[3];
  ptrdiff_t           nuncoded_fragis[3];
  /*The starting fragment for the current MCU in each plane.*/
  ptrdiff_t           froffset[3];
  /*The starting row for the current MCU in each plane.*/
  int                 fragy0[3];
  /*The ending row for the current MCU in each plane.*/
  int                 fragy_end[3];
  /*The starting superblock for the current MCU in each plane.*/
  unsigned            sbi0[3];
  /*The ending superblock for the current MCU in each plane.*/
  unsigned            sbi_end[3];
  /*The number of tokens for zzi=1 for each color plane.*/
  int                 ndct_tokens1[3];
  /*The outstanding eob_run count for zzi=1 for each color plane.*/
  int                 eob_run1[3];
  /*Whether or not the loop filter is enabled.*/
  int                 loop_filter;
};
494
495
/*Initialize the analysis pipeline state for the current frame: per-plane
   flag/qi trackers, skip-SSD pointers, fragment lists, condensed quantizer
   tables, tokenization state, and loop filter bounding values.*/
static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
  ptrdiff_t *coded_fragis;
  unsigned   mcu_nvsbs;
  ptrdiff_t  mcu_nfrags;
  int        hdec;
  int        vdec;
  int        pli;
  int        qii;
  int        qti;
  /*Initialize the per-plane coded block flag trackers.
    These are used for bit-estimation purposes only; the real flag bits span
     all three planes, so we can't compute them in parallel.*/
  for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
  for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
  /*Set up the per-plane skip SSD storage pointers.
    16 fragments per luma super block.*/
  mcu_nvsbs=_enc->mcu_nvsbs;
  mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
  hdec=!(_enc->state.info.pixel_fmt&1);
  vdec=!(_enc->state.info.pixel_fmt&2);
  _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
  _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
  /*Note: '+' binds tighter than '>>'; the shift amount is hdec+vdec, i.e.
     the luma fragment count scaled down to the chroma plane's size.*/
  _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
  /*Set up per-plane pointers to the coded and uncoded fragments lists.
    Unlike the decoder, each planes' coded and uncoded fragment list is kept
     separate during the analysis stage; we only make the coded list for all
     three planes contiguous right before the final packet is output
     (destroying the uncoded lists, which are no longer needed).*/
  coded_fragis=_enc->state.coded_fragis;
  for(pli=0;pli<3;pli++){
    _pipe->coded_fragis[pli]=coded_fragis;
    coded_fragis+=_enc->state.fplanes[pli].nfrags;
    /*The uncoded list grows downward from the end of the plane's region.*/
    _pipe->uncoded_fragis[pli]=coded_fragis;
  }
  memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
  memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
  /*Set up condensed quantizer tables, indexed by [plane][qi slot][frame
     type] rather than by the raw qi value.*/
  for(pli=0;pli<3;pli++){
    for(qii=0;qii<_enc->state.nqis;qii++){
      int qi;
      qi=_enc->state.qis[qii];
      for(qti=0;qti<2;qti++){
        _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
        _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
      }
    }
  }
  /*Initialize the tokenization state.*/
  for(pli=0;pli<3;pli++){
    _pipe->ndct_tokens1[pli]=0;
    _pipe->eob_run1[pli]=0;
  }
  /*Initialize the bounding value array for the loop filter.
    A non-zero return from the init function means the filter is disabled.*/
  _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
   _pipe->bounding_values);
}
551
/*Sets the current MCU stripe to super block row _sby.
  Return: A non-zero value if this was NOT the last MCU (i.e., more stripes
   remain); the original comment had this inverted -- notdone is computed
   below as "another full MCU fits after this one".*/
static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,int _sby){
  const oc_fragment_plane *fplane;
  unsigned                 mcu_nvsbs;
  int                      sby_end;
  int                      notdone;
  int                      vdec;
  int                      pli;
  mcu_nvsbs=_enc->mcu_nvsbs;
  sby_end=_enc->state.fplanes[0].nvsbs;
  notdone=_sby+mcu_nvsbs<sby_end;
  if(notdone)sby_end=_sby+mcu_nvsbs;
  /*vdec is 0 for the luma plane; it is set to the chroma vertical decimation
     at the bottom of the loop for the two chroma planes.*/
  vdec=0;
  for(pli=0;pli<3;pli++){
    fplane=_enc->state.fplanes+pli;
    _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
    /*Each super block row is 4 fragment rows tall; '-' binds tighter than
       '<<', so the shift amount is 2-vdec.*/
    _pipe->fragy0[pli]=_sby<<2-vdec;
    _pipe->froffset[pli]=fplane->froffset
     +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
    if(notdone){
      _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
      _pipe->fragy_end[pli]=sby_end<<2-vdec;
    }
    else{
      /*The final stripe takes whatever remains of the plane.*/
      _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
      _pipe->fragy_end[pli]=fplane->nvfrags;
    }
    vdec=!(_enc->state.info.pixel_fmt&2);
  }
  return notdone;
}
585
/*Finish analysis of one plane of the current MCU: copy uncoded fragments
   into the reconstruction, run DC prediction and DC tokenization, advance
   the fragment lists, and apply the loop filter and border filling.
  _sdelay: The number of fragment rows to delay the start of filtering by.
  _edelay: The number of fragment rows to delay the end of filtering by.*/
static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
  int refi;
  /*Copy over all the uncoded fragments from this plane and advance the uncoded
     fragment list.
    The uncoded list was filled downward, so step back over its entries.*/
  _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
  oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
   _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
  _pipe->nuncoded_fragis[_pli]=0;
  /*Perform DC prediction.*/
  oc_enc_pred_dc_frag_rows(_enc,_pli,
   _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
  /*Finish DC tokenization.*/
  oc_enc_tokenize_dc_frag_list(_enc,_pli,
   _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
   _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
  /*Remember the zzi=1 token/EOB state so the next MCU can continue it.*/
  _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
  _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
  /*And advance the coded fragment list.*/
  _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
  _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
  _pipe->ncoded_fragis[_pli]=0;
  /*Apply the loop filter if necessary.*/
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
  if(_pipe->loop_filter){
    oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
     refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
  }
  else _sdelay=_edelay=0;
  /*To fill borders, we have an additional two pixel delay, since a fragment
     in the next row could filter its top edge, using two pixels from a
     fragment in this row.
    But there's no reason to delay a full fragment between the two.*/
  oc_state_borders_fill_rows(&_enc->state,refi,_pli,
   (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
   (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
}
623
624
625
/*Cost information about the coded blocks in a MB.*/
struct oc_rd_metric{
  /*Sum of the AC-only SSD that skipping the blocks would incur.*/
  int uncoded_ac_ssd;
  /*Sum of the AC-only SSD actually incurred by coding the blocks.*/
  int coded_ac_ssd;
  /*Total AC token bits charged for the coded blocks.*/
  int ac_bits;
  /*Set when a block had no valid skip SSD (uncoded_ssd==UINT_MAX), which
     disables whole-MB skipping (see oc_enc_mb_transform_quantize_luma).*/
  int dc_flag;
};
633
634
635
/*Motion-compensate, transform, quantize, tokenize, and reconstruct a single
   fragment, then decide whether it was worth coding at all.
  _pli:           The color plane the fragment belongs to.
  _fragi:         The index of the fragment.
  _overhead_bits: The coded-block-flag overhead coding this fragment would
                   incur; negative values are clamped to zero below.
  _mo:            Rate/distortion metrics accumulated over the containing MB.
  _stack:         The token checkpoint stack, used to roll the tokenization
                   back if the fragment is ultimately skipped.
  Return: 1 if the fragment was coded, or 0 if it was skipped.*/
static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
 oc_rd_metric *_mo,oc_token_checkpoint **_stack){
  OC_ALIGN16(ogg_int16_t  dct[64]);
  OC_ALIGN16(ogg_int16_t  data[64]);
  ogg_uint16_t            dc_dequant;
  const ogg_uint16_t     *dequant;
  const oc_iquant        *enquant;
  ptrdiff_t               frag_offs;
  int                     ystride;
  const unsigned char    *src;
  const unsigned char    *ref;
  unsigned char          *dst;
  int                     frame_type;
  int                     nonzero;
  unsigned                uncoded_ssd;
  unsigned                coded_ssd;
  int                     coded_dc;
  oc_token_checkpoint    *checkpoint;
  oc_fragment            *frags;
  int                     mb_mode;
  int                     mv_offs[2];
  int                     nmv_offs;
  int                     ac_bits;
  int                     borderi;
  int                     qti;
  int                     qii;
  int                     pi;
  int                     zzi;
  int                     v;
  int                     val;
  int                     d;
  int                     s;
  int                     dc;
  frags=_enc->state.frags;
  frag_offs=_enc->state.frag_buf_offs[_fragi];
  ystride=_enc->state.ref_ystride[_pli];
  src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
  borderi=frags[_fragi].borderi;
  qii=frags[_fragi].qii;
  /*A qii with bits above the low 2 set appears to be used as a skip hint --
     TODO confirm against where frags[].qii is assigned.*/
  if(qii&~3){
#if !defined(OC_COLLECT_METRICS)
    if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
      /*Enable early skip detection.*/
      frags[_fragi].coded=0;
      return 0;
    }
#endif
    /*Try and code this block anyway.*/
    qii&=3;
    frags[_fragi].qii=qii;
  }
  mb_mode=frags[_fragi].mb_mode;
  ref=_enc->state.ref_frame_data[
   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
  dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
   +frag_offs;
  /*Motion compensation:*/
  switch(mb_mode){
    case OC_MODE_INTRA:{
      nmv_offs=0;
      oc_enc_frag_sub_128(_enc,data,src,ystride);
    }break;
    case OC_MODE_GOLDEN_NOMV:
    case OC_MODE_INTER_NOMV:{
      nmv_offs=1;
      mv_offs[0]=0;
      oc_enc_frag_sub(_enc,data,src,ref,ystride);
    }break;
    default:{
      const oc_mv *frag_mvs;
      frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
      nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
       frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
      if(nmv_offs>1){
        /*Two MV offsets: build the predictor in dst from both references
           before subtracting.*/
        oc_enc_frag_copy2(_enc,dst,
         ref+mv_offs[0],ref+mv_offs[1],ystride);
        oc_enc_frag_sub(_enc,data,src,dst,ystride);
      }
      else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
    }break;
  }
#if defined(OC_COLLECT_METRICS)
  {
    unsigned satd;
    switch(nmv_offs){
      case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break;
      case 1:{
        satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
      }break;
      default:{
        satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX);
      }
    }
    _enc->frag_satd[_fragi]=satd;
  }
#endif
  /*Transform:*/
  oc_enc_fdct8x8(_enc,dct,data);
  /*Quantize the DC coefficient:*/
  qti=mb_mode!=OC_MODE_INTRA;
  enquant=_pipe->enquant[_pli][0][qti];
  dc_dequant=_pipe->dequant[_pli][0][qti][0];
  v=dct[0];
  val=v<<1;
  s=OC_SIGNMASK(val);
  /*(dc_dequant+s)^s is +/-dc_dequant; this rounds ties away from zero, like
     the AC bias below.*/
  val+=dc_dequant+s^s;
  val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
  dc=OC_CLAMPI(-580,val,580);
  nonzero=0;
  /*Quantize the AC coefficients:*/
  dequant=_pipe->dequant[_pli][qii][qti];
  enquant=_pipe->enquant[_pli][qii][qti];
  for(zzi=1;zzi<64;zzi++){
    v=dct[OC_FZIG_ZAG[zzi]];
    d=dequant[zzi];
    val=v<<1;
    v=abs(val);
    if(v>=d){
      s=OC_SIGNMASK(val);
      /*The bias added here rounds ties away from zero, since token
         optimization can only decrease the magnitude of the quantized
         value.*/
      val+=d+s^s;
      /*Note the arithmetic right shift is not guaranteed by ANSI C.
        Hopefully no one still uses ones-complement architectures.*/
      val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
      data[zzi]=OC_CLAMPI(-580,val,580);
      nonzero=zzi;
    }
    else data[zzi]=0;
  }
  /*Tokenize.*/
  checkpoint=*_stack;
  ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
   _stack,qti?0:3);
  /*Reconstruct.
    TODO: nonzero may need to be adjusted after tokenization.*/
  if(nonzero==0){
    ogg_int16_t p;
    int         ci;
    /*We round this dequant product (and not any of the others) because there's
       no iDCT rounding.*/
    p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
    /*LOOP VECTORIZES.*/
    for(ci=0;ci<64;ci++)data[ci]=p;
  }
  else{
    data[0]=dc*dc_dequant;
    oc_idct8x8(&_enc->state,data,nonzero+1);
  }
  if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
  else{
    oc_enc_frag_recon_inter(_enc,dst,
     nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
  }
  frame_type=_enc->state.frame_type;
#if !defined(OC_COLLECT_METRICS)
  if(frame_type!=OC_INTRA_FRAME)
#endif
  {
    /*In retrospect, should we have skipped this block?*/
    oc_enc_frag_sub(_enc,data,src,dst,ystride);
    coded_ssd=coded_dc=0;
    if(borderi<0){
      for(pi=0;pi<64;pi++){
        coded_ssd+=data[pi]*data[pi];
        coded_dc+=data[pi];
      }
    }
    else{
      ogg_int64_t mask;
      /*Only accumulate error over the pixels selected by the border mask.*/
      mask=_enc->state.borders[borderi].mask;
      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
        coded_ssd+=data[pi]*data[pi];
        coded_dc+=data[pi];
      }
    }
    /*Scale to match DCT domain.*/
    coded_ssd<<=4;
    /*We actually only want the AC contribution to the SSD.*/
    coded_ssd-=coded_dc*coded_dc>>2;
#if defined(OC_COLLECT_METRICS)
    _enc->frag_ssd[_fragi]=coded_ssd;
  }
  if(frame_type!=OC_INTRA_FRAME){
#endif
    uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
    if(uncoded_ssd<UINT_MAX){
      /*Although the fragment coding overhead determination is accurate, it is
         greedy, using very coarse-grained local information.
        Allowing it to mildly discourage coding turns out to be beneficial, but
         it's not clear that allowing it to encourage coding through negative
         coding overhead deltas is useful.
        For that reason, we disallow negative coding_overheads.*/
      if(_overhead_bits<0)_overhead_bits=0;
      if(uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&&
       /*Don't allow luma blocks to be skipped in 4MV mode when VP3
          compatibility is enabled.*/
       (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
        /*Hm, not worth it; roll back.*/
        oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
        *_stack=checkpoint;
        frags[_fragi].coded=0;
        return 0;
      }
    }
    /*No valid skip SSD: flag the MB so it cannot be skipped wholesale.*/
    else _mo->dc_flag=1;
    _mo->uncoded_ac_ssd+=uncoded_ssd;
    _mo->coded_ac_ssd+=coded_ssd;
    _mo->ac_bits+=ac_bits;
  }
  oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii);
  frags[_fragi].dc=dc;
  frags[_fragi].coded=1;
  return 1;
}
853
/*Transform and quantize the four luma blocks of a macro block, then decide,
   with the mode and MV overhead included, whether the MB is worth coding at
   all, rolling everything back if not.
  _mbi:           The macro block index.
  _mode_overhead: The bit overhead of signalling the chosen mode.
  Return: The number of luma blocks left coded in this MB.*/
static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead){
  /*Worst case token stack usage for 4 fragments.*/
  oc_token_checkpoint  stack[64*4];
  oc_token_checkpoint *stackptr;
  const oc_sb_map     *sb_maps;
  signed char         *mb_modes;
  oc_fragment         *frags;
  ptrdiff_t           *coded_fragis;
  ptrdiff_t            ncoded_fragis;
  ptrdiff_t           *uncoded_fragis;
  ptrdiff_t            nuncoded_fragis;
  oc_rd_metric         mo;
  oc_fr_state          fr_checkpoint;
  oc_qii_state         qs_checkpoint;
  int                  mb_mode;
  int                  ncoded;
  ptrdiff_t            fragi;
  int                  bi;
  /*Snapshot the flag and qi state so the whole MB can be rolled back.*/
  *&fr_checkpoint=*(_pipe->fr+0);
  *&qs_checkpoint=*(_pipe->qs+0);
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
  mb_modes=_enc->state.mb_modes;
  frags=_enc->state.frags;
  coded_fragis=_pipe->coded_fragis[0];
  ncoded_fragis=_pipe->ncoded_fragis[0];
  uncoded_fragis=_pipe->uncoded_fragis[0];
  nuncoded_fragis=_pipe->nuncoded_fragis[0];
  mb_mode=mb_modes[_mbi];
  ncoded=0;
  stackptr=stack;
  memset(&mo,0,sizeof(mo));
  for(bi=0;bi<4;bi++){
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
    frags[fragi].mb_mode=mb_mode;
    if(oc_enc_block_transform_quantize(_enc,
     _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){
      oc_fr_code_block(_pipe->fr+0);
      coded_fragis[ncoded_fragis++]=fragi;
      ncoded++;
    }
    else{
      /*The uncoded list grows downward from the end of its buffer.*/
      *(uncoded_fragis-++nuncoded_fragis)=fragi;
      oc_fr_skip_block(_pipe->fr+0);
    }
  }
  if(_enc->state.frame_type!=OC_INTRA_FRAME){
    if(ncoded>0&&!mo.dc_flag){
      int cost;
      /*Some individual blocks were worth coding.
        See if that's still true when accounting for mode and MV overhead.*/
      cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
       +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
      if(mo.uncoded_ac_ssd<=cost){
        /*Taking macroblock overhead into account, it is not worth coding this
           MB.*/
        oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
        *(_pipe->fr+0)=*&fr_checkpoint;
        *(_pipe->qs+0)=*&qs_checkpoint;
        for(bi=0;bi<4;bi++){
          fragi=sb_maps[_mbi>>2][_mbi&3][bi];
          if(frags[fragi].coded){
            *(uncoded_fragis-++nuncoded_fragis)=fragi;
            frags[fragi].coded=0;
          }
          oc_fr_skip_block(_pipe->fr+0);
        }
        ncoded_fragis-=ncoded;
        ncoded=0;
      }
    }
    /*If no luma blocks coded, the mode is forced.*/
    if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
    /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
       with a single coded block.
      This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
       skipped blocks, while a 1MV does not.*/
    else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
      mb_modes[_mbi]=OC_MODE_INTER_MV;
    }
  }
  _pipe->ncoded_fragis[0]=ncoded_fragis;
  _pipe->nuncoded_fragis[0]=nuncoded_fragis;
  return ncoded;
}
939
/*Transform and quantize all fragments in the given range of super blocks in
   a chroma plane, updating the coded/uncoded lists and the per-SB coded
   flags.
  _pli:       The plane index (presumably 1 or 2, per the function name --
               confirm at the call sites).
  _sbi_start: The first super block index.
  _sbi_end:   One past the last super block index.*/
static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
  const oc_sb_map *sb_maps;
  oc_sb_flags     *sb_flags;
  ptrdiff_t       *coded_fragis;
  ptrdiff_t        ncoded_fragis;
  ptrdiff_t       *uncoded_fragis;
  ptrdiff_t        nuncoded_fragis;
  int              sbi;
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
  sb_flags=_enc->state.sb_flags;
  coded_fragis=_pipe->coded_fragis[_pli];
  ncoded_fragis=_pipe->ncoded_fragis[_pli];
  uncoded_fragis=_pipe->uncoded_fragis[_pli];
  nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
    /*Worst case token stack usage for 1 fragment.*/
    oc_token_checkpoint stack[64];
    oc_rd_metric        mo;
    int                 quadi;
    int                 bi;
    memset(&mo,0,sizeof(mo));
    for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
      ptrdiff_t fragi;
      fragi=sb_maps[sbi][quadi][bi];
      /*A negative map entry means there is no fragment at this position.*/
      if(fragi>=0){
        oc_token_checkpoint *stackptr;
        stackptr=stack;
        if(oc_enc_block_transform_quantize(_enc,
         _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){
          coded_fragis[ncoded_fragis++]=fragi;
          oc_fr_code_block(_pipe->fr+_pli);
        }
        else{
          /*The uncoded list grows downward from the end of its buffer.*/
          *(uncoded_fragis-++nuncoded_fragis)=fragi;
          oc_fr_skip_block(_pipe->fr+_pli);
        }
      }
    }
    /*Finish the SB (it may have fewer than 16 blocks) and record its
       resulting coded flags.*/
    oc_fr_state_flush_sb(_pipe->fr+_pli);
    sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full;
    sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial;
  }
  _pipe->ncoded_fragis[_pli]=ncoded_fragis;
  _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
}
986
987 /*Mode decision is done by exhaustively examining all potential choices.
988 Obviously, doing the motion compensation, fDCT, tokenization, and then
989 counting the bits each token uses is computationally expensive.
990 Theora's EOB runs can also split the cost of these tokens across multiple
991 fragments, and naturally we don't know what the optimal choice of Huffman
992 codes will be until we know all the tokens we're going to encode in all the
993 fragments.
994 So we use a simple approach to estimating the bit cost and distortion of each
995 mode based upon the SATD value of the residual before coding.
996 The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
997 the process (modified somewhat from that of the paper) is very simple.
998 We build a non-linear regression of the mappings from
999 (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1000 SSD for each qi.
1001 A separate set of mappings is kept for each quantization type and color
1002 plane.
1003 The mappings are constructed by partitioning the SATD values into a small
1004 number of bins (currently 24) and using a linear regression in each bin
1005 (as opposed to the 0th-order regression used by Kim).
1006 The bit counts and SSD measurements are obtained by examining actual encoded
1007 frames, with appropriate lambda values and optimal Huffman codes selected.
1008 EOB bits are assigned to the fragment that started the EOB run (as opposed to
1009 dividing them among all the blocks in the run; though the latter approach
1010 seems more theoretically correct, Monty's testing showed a small improvement
1011 with the former, though that may have been merely statistical noise).
1012
1013 @ARTICLE{Kim03,
1014 author="Hyun Mun Kim",
1015 title="Adaptive Rate Control Using Nonlinear Regression",
1016 journal="IEEE Transactions on Circuits and Systems for Video Technology",
1017 volume=13,
1018 number=5,
1019 pages="432--439",
1020 month=May,
1021 year=2003
1022 }*/
1023
/*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
   overflow for large lambda values.
  The integer and fractional parts of each term are accumulated separately
   and the fractional sum is rounded to nearest before being folded back in.
  The whole expansion is parenthesized so the macro is safe to embed in
   larger expressions (it previously was not).*/
#define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
 (((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE))
1030
/*Estimate the R-D cost of the DCT coefficients given the SATD of a block
   after prediction.
  The rate and RMSE are linearly interpolated from the OC_MODE_RD table,
   indexed by qi, color plane, and quantization type (see the block comment
   above for the training methodology).
  _ssd:  Returns the estimated SSD, scaled by 1<<OC_BIT_SCALE.
  _qi:   The quality index to estimate for.
  _pli:  The color plane index.
  _qti:  The quantization type (0 for INTRA, 1 for INTER).
  _satd: The pre-transform SATD of the block residual.
  Return: The estimated rate, scaled by 1<<OC_BIT_SCALE.*/
static unsigned oc_dct_cost2(unsigned *_ssd,
 int _qi,int _pli,int _qti,int _satd){
  unsigned rmse;
  int      bin;
  int      dx;
  int      y0;
  int      z0;
  int      dy;
  int      dz;
  /*SATD metrics for chroma planes vary much less than luma, so we scale them
     by 4 to distribute them into the mode decision bins more evenly.
    _pli+1&2 parses as (_pli+1)&2: a shift of 2 for planes 1 and 2, and 0 for
     plane 0.*/
  _satd<<=_pli+1&2;
  /*Clamp to the second-to-last bin so that bin+1 below stays in range.*/
  bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
  /*dx: the offset of _satd within its bin, used as interpolation weight.*/
  dx=_satd-(bin<<OC_SAD_SHIFT);
  y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
  z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
  /*Slopes to the next bin for linear interpolation.*/
  dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0;
  dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0;
  /*Interpolated values can go negative in a bin; clamp to zero.*/
  rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
  /*The shift amount parses as 2*OC_RMSE_SCALE-OC_BIT_SCALE, converting the
     squared RMSE to an SSD with OC_BIT_SCALE fractional bits.*/
  *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
  return OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
}
1055
/*Select luma block-level quantizers for a MB in an INTRA frame.
  The four block qiis are chosen jointly with a Viterbi-style search:
   qs[bi][qii], cost[bi][qii], ssd[bi][qii], and rate[bi][qii] describe the
   cheapest chain of choices ending with quantizer index qii at block bi, and
   prev[bi-1][qii] is the back-pointer used to recover the winning chain.
  Returns the R-D cost of the best chain found.*/
static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
 const oc_qii_state *_qs,unsigned _mbi){
  const unsigned char *src;
  const ptrdiff_t     *frag_buf_offs;
  const oc_sb_map     *sb_maps;
  oc_fragment         *frags;
  ptrdiff_t            frag_offs;
  ptrdiff_t            fragi;
  oc_qii_state         qs[4][3];
  unsigned             cost[4][3];
  unsigned             ssd[4][3];
  unsigned             rate[4][3];
  int                  prev[3][3];
  unsigned             satd;
  unsigned             best_cost;
  unsigned             best_ssd;
  unsigned             best_rate;
  int                  best_qii;
  int                  qii;
  int                  lambda;
  int                  ystride;
  int                  nqis;
  int                  bi;
  frag_buf_offs=_enc->state.frag_buf_offs;
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ystride=_enc->state.ref_ystride[0];
  /*Initialize the trellis with the costs of the MB's first luma block.*/
  fragi=sb_maps[_mbi>>2][_mbi&3][0];
  frag_offs=frag_buf_offs[fragi];
  satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
  nqis=_enc->state.nqis;
  lambda=_enc->lambda;
  for(qii=0;qii<nqis;qii++){
    oc_qii_state_advance(qs[0]+qii,_qs,qii);
    /*The qii-flag bit delta is scaled after the subtraction; oc_dct_cost2
       already returns its rate in scaled units.*/
    rate[0][qii]=oc_dct_cost2(ssd[0]+qii,_enc->state.qis[qii],0,0,satd)
     +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
    cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
  }
  /*Extend the chains through the remaining three blocks.*/
  for(bi=1;bi<4;bi++){
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
    frag_offs=frag_buf_offs[fragi];
    satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
    for(qii=0;qii<nqis;qii++){
      oc_qii_state qt[3];
      unsigned     cur_ssd;
      unsigned     cur_rate;
      int          best_qij;
      int          qij;
      /*qij is the quantizer index of the previous block in the chain; start
         with qij==0 as the initial best.*/
      oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,0,satd);
      best_ssd=ssd[bi-1][0]+cur_ssd;
      best_rate=rate[bi-1][0]+cur_rate
       +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
      best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
      best_qij=0;
      for(qij=1;qij<nqis;qij++){
        unsigned chain_ssd;
        unsigned chain_rate;
        unsigned chain_cost;
        oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
        chain_ssd=ssd[bi-1][qij]+cur_ssd;
        chain_rate=rate[bi-1][qij]+cur_rate
         +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
        chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
        if(chain_cost<best_cost){
          best_cost=chain_cost;
          best_ssd=chain_ssd;
          best_rate=chain_rate;
          best_qij=qij;
        }
      }
      /*Record the winning predecessor state and back-pointer.*/
      *(qs[bi]+qii)=*(qt+best_qij);
      cost[bi][qii]=best_cost;
      ssd[bi][qii]=best_ssd;
      rate[bi][qii]=best_rate;
      prev[bi-1][qii]=best_qij;
    }
  }
  /*Pick the cheapest chain ending at the last block.*/
  best_qii=0;
  best_cost=cost[3][0];
  for(qii=1;qii<nqis;qii++){
    if(cost[3][qii]<best_cost){
      best_cost=cost[3][qii];
      best_qii=qii;
    }
  }
  /*Walk the back-pointers to assign the winning qii to each block.*/
  frags=_enc->state.frags;
  for(bi=3;;){
    fragi=sb_maps[_mbi>>2][_mbi&3][bi];
    frags[fragi].qii=best_qii;
    if(bi--<=0)break;
    best_qii=prev[bi][best_qii];
  }
  return best_cost;
}
1152
1153 /*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
oc_analyze_intra_chroma_block(oc_enc_ctx * _enc,const oc_qii_state * _qs,int _pli,ptrdiff_t _fragi)1154 static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1155 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi){
1156 const unsigned char *src;
1157 oc_fragment *frags;
1158 ptrdiff_t frag_offs;
1159 oc_qii_state qt[3];
1160 unsigned cost[3];
1161 unsigned satd;
1162 unsigned best_cost;
1163 int best_qii;
1164 int qii;
1165 int lambda;
1166 int ystride;
1167 int nqis;
1168 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1169 ystride=_enc->state.ref_ystride[_pli];
1170 frag_offs=_enc->state.frag_buf_offs[_fragi];
1171 satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
1172 nqis=_enc->state.nqis;
1173 lambda=_enc->lambda;
1174 best_qii=0;
1175 for(qii=0;qii<nqis;qii++){
1176 unsigned cur_rate;
1177 unsigned cur_ssd;
1178 oc_qii_state_advance(qt+qii,_qs,qii);
1179 cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],_pli,0,satd)
1180 +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1181 cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
1182 }
1183 best_cost=cost[0];
1184 for(qii=1;qii<nqis;qii++){
1185 if(cost[qii]<best_cost){
1186 best_cost=cost[qii];
1187 best_qii=qii;
1188 }
1189 }
1190 frags=_enc->state.frags;
1191 frags[_fragi].qii=best_qii;
1192 return best_cost;
1193 }
1194
/*Selects quantizers for and codes every chroma fragment in the super blocks
   in the range [_sbi_start,_sbi_end) of the given plane.
  All fragments are coded in an INTRA frame, so no skip decisions are made.*/
static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
  const oc_sb_map *map;
  ptrdiff_t       *coded;
  ptrdiff_t        ncoded;
  int              sbi;
  map=(const oc_sb_map *)_enc->state.sb_maps;
  coded=_pipe->coded_fragis[_pli];
  ncoded=_pipe->ncoded_fragis[_pli];
  for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
    /*Worst case token stack usage for 1 fragment.*/
    oc_token_checkpoint stack[64];
    int                 blki;
    /*Visit all 16 block slots: blki>>2 is the quadrant, blki&3 the block,
       preserving the original iteration order.*/
    for(blki=0;blki<16;blki++){
      ptrdiff_t fragi;
      fragi=map[sbi][blki>>2][blki&3];
      if(fragi>=0){
        oc_token_checkpoint *stackptr;
        /*Pick the block's qii, then transform and tokenize it.*/
        oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi);
        stackptr=stack;
        oc_enc_block_transform_quantize(_enc,
         _pipe,_pli,fragi,0,NULL,&stackptr);
        coded[ncoded++]=fragi;
      }
    }
  }
  _pipe->ncoded_fragis[_pli]=ncoded;
}
1226
/*Analysis stage for an INTRA frame.
  Selects block-level quantizers and transforms, quantizes, and tokenizes
   every fragment, one stripe (MCU) of super blocks at a time.
  _recode: Nonzero when this frame is being coded again, in which case the
            motion searches are skipped (their results presumably persist
            from the first pass -- confirm in oc_mcenc_search).*/
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
  oc_enc_pipeline_state pipe;
  const unsigned char  *map_idxs;
  int                   nmap_idxs;
  oc_sb_flags          *sb_flags;
  signed char          *mb_modes;
  const oc_mb_map      *mb_maps;
  oc_mb_enc_info       *embs;
  oc_fragment          *frags;
  unsigned              stripe_sby;
  unsigned              mcu_nvsbs;
  int                   notstart;
  int                   notdone;
  int                   refi;
  int                   pli;
  _enc->state.frame_type=OC_INTRA_FRAME;
  oc_enc_tokenize_start(_enc);
  oc_enc_pipeline_init(_enc,&pipe);
  /*Choose MVs and MB modes and quantize and code luma.
    Must be done in Hilbert order.*/
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  _enc->state.ncoded_fragis[0]=0;
  _enc->state.ncoded_fragis[1]=0;
  _enc->state.ncoded_fragis[2]=0;
  sb_flags=_enc->state.sb_flags;
  mb_modes=_enc->state.mb_modes;
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
  /*NOTE(review): embs is assigned here but never referenced below.*/
  embs=_enc->mb_info;
  frags=_enc->state.frags;
  notstart=0;
  notdone=1;
  mcu_nvsbs=_enc->mcu_nvsbs;
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
    unsigned sbi;
    unsigned sbi_end;
    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
    sbi_end=pipe.sbi_end[0];
    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
      int quadi;
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
        unsigned  mbi;
        int       mapii;
        int       mapi;
        int       bi;
        ptrdiff_t fragi;
        mbi=sbi<<2|quadi;
        /*Motion estimation:
          We always do a basic 1MV search for all macroblocks, coded or not,
           keyframe or not.*/
        if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_search(_enc,mbi);
        /*Jointly select the luma block qiis, then code the luma plane.*/
        oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi);
        mb_modes[mbi]=OC_MODE_INTRA;
        oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
        /*Propagate final MB mode and MVs to the chroma blocks.*/
        for(mapii=4;mapii<nmap_idxs;mapii++){
          mapi=map_idxs[mapii];
          pli=mapi>>2;
          bi=mapi&3;
          fragi=mb_maps[mbi][pli][bi];
          frags[fragi].mb_mode=OC_MODE_INTRA;
        }
      }
    }
    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
    /*Code chroma planes.*/
    for(pli=1;pli<3;pli++){
      oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
    }
    notstart=1;
  }
  /*Finish filling in the reference frame borders.*/
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
  /*Every fragment is coded in an INTRA frame.*/
  _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
}
1307
1308
1309
/*Cost information about a MB mode.*/
struct oc_mode_choice{
  /*The total R-D cost, computed by oc_mode_set_cost() as
     OC_MODE_RD_COST(ssd,rate+overhead,lambda).*/
  unsigned      cost;
  /*The estimated SSD of the mode, scaled by 1<<OC_BIT_SCALE.*/
  unsigned      ssd;
  /*The estimated rate of the DCT tokens and flags, scaled by
     1<<OC_BIT_SCALE.*/
  unsigned      rate;
  /*The estimated signaling overhead (mode and MV bits), scaled by
     1<<OC_BIT_SCALE.*/
  unsigned      overhead;
  /*The qii selected for each block; values of 4 or more mark blocks the
     analysis estimated were cheaper to skip.*/
  unsigned char qii[12];
};
1318
1319
1320
/*Compute the total R-D cost of a mode from its accumulated SSD, rate, and
   signaling overhead.*/
static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
  unsigned total_rate;
  total_rate=_modec->rate+_modec->overhead;
  _modec->cost=OC_MODE_RD_COST(_modec->ssd,total_rate,_lambda);
}
1325
/*A set of skip SSD's to use to disable early skipping.
  The analysis functions treat a skip SSD of UINT_MAX as "skipping not
   allowed" for that block.*/
static const unsigned OC_NOSKIP[12]={
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
  UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
};
1332
/*The estimated number of bits used by a coded chroma block to specify the AC
   quantizer.
  0xCAE00D1D appears to be log2(3) in Q31 format; the shift (which parses as
   >>(31-OC_BIT_SCALE)) rescales it to OC_BIT_SCALE fractional bits, and the
   final +1>>1 halves it with rounding.
  TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
   measurements suggest this is in the right ballpark, but it varies somewhat
   with lambda.*/
#define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
1339
/*Estimate the R-D cost of the luma blocks of a MB with the given
   quantization type (_qti: 0 for INTRA, 1 for INTER).
  For each block we greedily pick the cheapest of coding it with each
   available qii, or skipping it when _skip_ssd provides a valid estimate,
   tracking the coded-flag and qii-flag coder states as we go.
  The totals are accumulated into _modec; qii values of 4 or more mark
   blocks chosen for skipping.*/
static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
  oc_fr_state  fr;
  oc_qii_state qs;
  unsigned     ssd;
  unsigned     rate;
  int          overhead;
  unsigned     satd;
  unsigned     best_ssd;
  unsigned     best_rate;
  int          best_overhead;
  int          best_fri;
  int          best_qii;
  unsigned     cur_cost;
  unsigned     cur_ssd;
  unsigned     cur_rate;
  int          cur_overhead;
  int          lambda;
  int          nqis;
  int          nskipped;
  int          bi;
  int          qii;
  lambda=_enc->lambda;
  nqis=_enc->state.nqis;
  /*We could do a trellis optimization here, but we don't make final skip
     decisions until after transform+quantization, so the result wouldn't be
     optimal anyway.
    Instead we just use a greedy approach; for most SATD values, the
     differences between the qiis are large enough to drown out the cost to
     code the flags, anyway.*/
  /*Work on local copies of the flag-coder states.*/
  *&fr=*_fr;
  *&qs=*_qs;
  ssd=rate=overhead=nskipped=0;
  for(bi=0;bi<4;bi++){
    oc_fr_state  ft[2];
    oc_qii_state qt[3];
    unsigned     best_cost;
    satd=_frag_satd[bi];
    /*ft[0]: the coded-flag state if this block is coded.*/
    *(ft+0)=*&fr;
    oc_fr_code_block(ft+0);
    oc_qii_state_advance(qt+0,&qs,0);
    /*The flag-bit deltas are scaled after the subtraction.*/
    best_overhead=(ft[0].bits-fr.bits<<OC_BIT_SCALE);
    best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],0,_qti,satd)
     +(qt[0].bits-qs.bits<<OC_BIT_SCALE);
    best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate+best_overhead,lambda);
    best_fri=0;
    best_qii=0;
    /*Try the remaining quantizers.*/
    for(qii=1;qii<nqis;qii++){
      oc_qii_state_advance(qt+qii,&qs,qii);
      cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
       +(qt[qii].bits-qs.bits<<OC_BIT_SCALE);
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate+best_overhead,lambda);
      if(cur_cost<best_cost){
        best_cost=cur_cost;
        best_ssd=cur_ssd;
        best_rate=cur_rate;
        best_qii=qii;
      }
    }
    /*Consider skipping the block, when allowed (_skip_ssd[bi]<UINT_MAX) and
       fewer than 3 blocks have already been skipped.*/
    if(_skip_ssd[bi]<UINT_MAX&&nskipped<3){
      /*ft[1]: the coded-flag state if this block is skipped.*/
      *(ft+1)=*&fr;
      oc_fr_skip_block(ft+1);
      cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
      cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
      /*A skipped block contributes no DCT rate, only the flag overhead.*/
      cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
      if(cur_cost<=best_cost){
        best_ssd=cur_ssd;
        best_rate=0;
        best_overhead=cur_overhead;
        best_fri=1;
        /*Flag the skip by pushing qii out of the valid range.*/
        best_qii+=4;
      }
    }
    rate+=best_rate;
    ssd+=best_ssd;
    overhead+=best_overhead;
    /*Commit the winning coded-flag state; the qii state only advances when
       the block is actually coded.*/
    *&fr=*(ft+best_fri);
    if(best_fri==0)*&qs=*(qt+best_qii);
    else nskipped++;
    _modec->qii[bi]=best_qii;
  }
  _modec->ssd=ssd;
  _modec->rate=rate;
  _modec->overhead=OC_MAXI(overhead,0);
}
1426
/*Estimate the R-D cost of the chroma blocks of a MB with the given
   quantization type, accumulating onto the luma totals already stored in
   _modec.
  For each block the cheapest qii is chosen greedily, and skipping is
   considered when _skip_ssd provides a valid estimate.*/
static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
  unsigned ssd;
  unsigned rate;
  unsigned satd;
  unsigned best_ssd;
  unsigned best_rate;
  int      best_qii;
  unsigned cur_cost;
  unsigned cur_ssd;
  unsigned cur_rate;
  int      lambda;
  int      nblocks;
  int      nqis;
  int      pli;
  int      bi;
  int      qii;
  lambda=_enc->lambda;
  nqis=_enc->state.nqis;
  /*Continue accumulating from the luma totals.*/
  ssd=_modec->ssd;
  rate=_modec->rate;
  /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
     order, we assume a constant overhead for coded block and qii flags.*/
  nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*First bound covers only the first plane's half of the chroma blocks; it
     is expanded below to take in the second plane's on the next pass.*/
  nblocks=(nblocks-4>>1)+4;
  bi=4;
  for(pli=1;pli<3;pli++){
    for(;bi<nblocks;bi++){
      unsigned best_cost;
      satd=_frag_satd[bi];
      best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd)
       +OC_CHROMA_QII_RATE;
      best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
      best_qii=0;
      for(qii=1;qii<nqis;qii++){
        /*Use this chroma plane's tables, matching the qii==0 candidate above
           (this previously passed plane 0, making the comparison across qiis
           inconsistent).*/
        cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],pli,_qti,satd)
         +OC_CHROMA_QII_RATE;
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
        if(cur_cost<best_cost){
          best_cost=cur_cost;
          best_ssd=cur_ssd;
          best_rate=cur_rate;
          best_qii=qii;
        }
      }
      /*A UINT_MAX skip SSD disables the skip option (see OC_NOSKIP).*/
      if(_skip_ssd[bi]<UINT_MAX){
        cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
        cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
        if(cur_cost<=best_cost){
          best_ssd=cur_ssd;
          best_rate=0;
          /*qii values of 4 or more mark the block as skipped.*/
          best_qii+=4;
        }
      }
      rate+=best_rate;
      ssd+=best_ssd;
      _modec->qii[bi]=best_qii;
    }
    nblocks=(nblocks-4<<1)+4;
  }
  _modec->ssd=ssd;
  _modec->rate=rate;
}
1491
/*Estimate the SSD, in the DCT-domain scale, of leaving each block of a MB
   uncoded (copied unchanged from the previous frame).
  Results are stored in _ssd[] (luma in 0...3, chroma after that) and also
   cached in _pipe->skip_ssd for later use.
  A block whose implicit DC change alone exceeds twice the DC quantizer has
   its skip SSD forced to UINT_MAX, which disables skipping it.*/
static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
 unsigned _mbi,unsigned _ssd[12]){
  OC_ALIGN16(ogg_int16_t buffer[64]);
  const unsigned char   *src;
  const unsigned char   *ref;
  int                    ystride;
  const oc_fragment     *frags;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *sb_map;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  int                    map_nidxs;
  ogg_int64_t            mask;
  unsigned               uncoded_ssd;
  int                    uncoded_dc;
  unsigned               dc_dequant;
  int                    dc_flag;
  int                    mapii;
  int                    mapi;
  int                    pli;
  int                    bi;
  ptrdiff_t              fragi;
  ptrdiff_t              frag_offs;
  int                    borderi;
  int                    pi;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
  ystride=_enc->state.ref_ystride[0];
  frags=_enc->state.frags;
  frag_buf_offs=_enc->state.frag_buf_offs;
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][0][1][0];
  /*The luma blocks.*/
  for(bi=0;bi<4;bi++){
    fragi=sb_map[bi];
    frag_offs=frag_buf_offs[fragi];
    /*The residual against the previous frame with no motion.*/
    oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
    borderi=frags[fragi].borderi;
    uncoded_ssd=uncoded_dc=0;
    if(borderi<0){
      for(pi=0;pi<64;pi++){
        uncoded_ssd+=buffer[pi]*buffer[pi];
        uncoded_dc+=buffer[pi];
      }
    }
    else{
      /*Sum only over the pixels the border mask marks as inside the frame.
        (The redundant inner declaration that shadowed mask has been
         removed.)*/
      mask=_enc->state.borders[borderi].mask;
      for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
        uncoded_ssd+=buffer[pi]*buffer[pi];
        uncoded_dc+=buffer[pi];
      }
    }
    /*Scale to match DCT domain.*/
    uncoded_ssd<<=4;
    /*We actually only want the AC contribution to the SSD.*/
    uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
    /*DC is a special case; if there's more than a full-quantizer improvement
       in the effective DC component, always force-code the block.
      The comparison parses as abs(uncoded_dc)>(dc_dequant<<1); |=-dc_flag
       saturates the SSD to UINT_MAX when set.*/
    dc_flag=abs(uncoded_dc)>dc_dequant<<1;
    uncoded_ssd|=-dc_flag;
    _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
  }
  /*The chroma blocks, in coded order.*/
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  /*First bound covers only the first plane's half of the chroma blocks; it
     is expanded below for the second plane.*/
  map_nidxs=(map_nidxs-4>>1)+4;
  mapii=4;
  for(pli=1;pli<3;pli++){
    ystride=_enc->state.ref_ystride[pli];
    dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][pli][1][0];
    for(;mapii<map_nidxs;mapii++){
      mapi=map_idxs[mapii];
      bi=mapi&3;
      fragi=mb_map[pli][bi];
      frag_offs=frag_buf_offs[fragi];
      oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
      borderi=frags[fragi].borderi;
      uncoded_ssd=uncoded_dc=0;
      if(borderi<0){
        for(pi=0;pi<64;pi++){
          uncoded_ssd+=buffer[pi]*buffer[pi];
          uncoded_dc+=buffer[pi];
        }
      }
      else{
        mask=_enc->state.borders[borderi].mask;
        for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
          uncoded_ssd+=buffer[pi]*buffer[pi];
          uncoded_dc+=buffer[pi];
        }
      }
      /*Scale to match DCT domain.*/
      uncoded_ssd<<=4;
      /*We actually only want the AC contribution to the SSD.*/
      uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
      /*DC is a special case; if there's more than a full-quantizer improvement
         in the effective DC component, always force-code the block.*/
      dc_flag=abs(uncoded_dc)>dc_dequant<<1;
      uncoded_ssd|=-dc_flag;
      _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
    }
    map_nidxs=(map_nidxs-4<<1)+4;
  }
}
1596
/*Measures the SATD of every fragment in a macro block when predicted from
   its own borders (the INTRA mode).
  Luma results are stored in _frag_satd[0...3], chroma results after that.*/
static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
 unsigned _frag_satd[12]){
  const unsigned char   *src;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *sb_map;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  int                    map_nidxs;
  int                    mapii;
  int                    ystride;
  int                    bi;
  frag_buf_offs=_enc->state.frag_buf_offs;
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  /*The four luma blocks.*/
  ystride=_enc->state.ref_ystride[0];
  for(bi=0;bi<4;bi++){
    _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,
     src+frag_buf_offs[sb_map[bi]],ystride);
  }
  /*The chroma blocks.*/
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(mapii=4;mapii<map_nidxs;mapii++){
    int mapi;
    /*mapi>>2 is the plane, mapi&3 the block within it.*/
    mapi=map_idxs[mapii];
    _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,
     src+frag_buf_offs[mb_map[mapi>>2][mapi&3]],ystride);
  }
}
1635
/*Estimate the R-D cost of coding the given MB with the INTRA mode.
  _frag_satd holds pre-measured INTRA SATDs for all the MB's blocks;
   _skip_ssd holds the per-block skip SSD estimates.*/
static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _frag_satd[12],const unsigned _skip_ssd[12]){
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
  /*Add the cost of signaling the INTRA mode itself.*/
  _modec->overhead+=
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
1645
/*Estimate the R-D cost of coding this MB with mode _mb_mode and the single
   motion vector _mv applied to the whole MB.
  The block SATDs are measured against the reference frame selected by
   OC_FRAME_FOR_MODE(_mb_mode) and fed through the luma/chroma analysis with
   the INTER quantization type.*/
static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,int _mb_mode,const signed char *_mv,
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
  unsigned               frag_satd[12];
  const unsigned char   *src;
  const unsigned char   *ref;
  int                    ystride;
  const ptrdiff_t       *frag_buf_offs;
  const ptrdiff_t       *sb_map;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  int                    map_nidxs;
  int                    mapii;
  int                    mapi;
  int                    mv_offs[2];
  int                    dx;
  int                    dy;
  int                    pli;
  int                    bi;
  ptrdiff_t              fragi;
  ptrdiff_t              frag_offs;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[
   _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
  ystride=_enc->state.ref_ystride[0];
  frag_buf_offs=_enc->state.frag_buf_offs;
  sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
  dx=_mv[0];
  dy=_mv[1];
  _modec->rate=_modec->ssd=0;
  /*Two offsets mean the prediction averages a pair of reference positions
     (presumably a half-pel MV component -- confirm in
     oc_state_get_mv_offsets); one offset uses a single position.*/
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
    for(bi=0;bi<4;bi++){
      fragi=sb_map[bi];
      frag_offs=frag_buf_offs[fragi];
      frag_satd[bi]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
    }
  }
  else{
    for(bi=0;bi<4;bi++){
      fragi=sb_map[bi];
      frag_offs=frag_buf_offs[fragi];
      frag_satd[bi]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
    }
  }
  /*Now the chroma blocks, with the MV offsets recomputed for plane 1.*/
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1){
    for(mapii=4;mapii<map_nidxs;mapii++){
      mapi=map_idxs[mapii];
      pli=mapi>>2;
      bi=mapi&3;
      fragi=mb_map[pli][bi];
      frag_offs=frag_buf_offs[fragi];
      frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
    }
  }
  else{
    for(mapii=4;mapii<map_nidxs;mapii++){
      mapi=map_idxs[mapii];
      pli=mapi>>2;
      bi=mapi&3;
      fragi=mb_map[pli][bi];
      frag_offs=frag_buf_offs[fragi];
      frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
    }
  }
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
  /*Add the cost of signaling the mode itself.*/
  _modec->overhead+=
   oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
1725
/*Estimate the R-D cost of coding this MB with a mode that uses an implicit
   (0,0) motion vector.*/
static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12]){
  /*A static const with no initializer is zero-initialized: the (0,0) MV.*/
  static const oc_mv OC_MV_ZERO;
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,OC_MV_ZERO,_fr,_qs,_skip_ssd);
}
1732
/*Estimate the R-D cost of coding this MB with mode _mb_mode and the given
   single motion vector, including the cost of coding the MV itself.
  Returns the number of bits needed to code _mv under MV scheme 0 (the
   variable-length scheme), so the caller can track MV bit totals.*/
static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,int _mb_mode,const signed char *_mv,
 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
  int bits0;
  oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd);
  /*+31 biases each component into OC_MV_BITS's index range (components are
     presumably in -31...31 -- confirm against the table's definition).*/
  bits0=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31];
  /*Charge the marginal cost of this MV: the best total over both MV schemes
     with it, minus the best total without it (a fixed-length MV costs 12
     bits).
    The subtraction happens before the <<OC_BIT_SCALE shift.*/
  _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
  return bits0;
}
1744
/*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.
  The first index is the MB's position within its super block (_mbi&3), the
   second the raster block index; used by oc_cost_inter4mv() to store
   per-block results in the order the analysis functions expect.*/
static const unsigned char OC_MB_PHASE[4][4]={
  {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
};
1749
/*Estimate the R-D cost of coding this MB with the INTER_MV_FOUR mode, using
   a separate motion vector for each luma block.
  As a side effect the candidate block MVs are stored in frag_mvs (replaced
   later if 4MV mode is not ultimately chosen), and blocks the luma analysis
   decides to skip have their MVs zeroed before the chroma MVs are derived
   from them.*/
static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
 const unsigned _skip_ssd[12]){
  unsigned               frag_satd[12];
  oc_mv                  lbmvs[4];
  oc_mv                  cbmvs[4];
  const unsigned char   *src;
  const unsigned char   *ref;
  int                    ystride;
  const ptrdiff_t       *frag_buf_offs;
  oc_mv                 *frag_mvs;
  const oc_mb_map_plane *mb_map;
  const unsigned char   *map_idxs;
  int                    map_nidxs;
  int                    nqis;
  int                    mapii;
  int                    mapi;
  int                    mv_offs[2];
  int                    dx;
  int                    dy;
  int                    pli;
  int                    bi;
  ptrdiff_t              fragi;
  ptrdiff_t              frag_offs;
  int                    bits0;
  int                    bits1;
  unsigned               satd;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
  ystride=_enc->state.ref_ystride[0];
  frag_buf_offs=_enc->state.frag_buf_offs;
  frag_mvs=_enc->state.frag_mvs;
  mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
  _modec->rate=_modec->ssd=0;
  /*Measure each luma block's SATD with its own MV.*/
  for(bi=0;bi<4;bi++){
    fragi=mb_map[0][bi];
    dx=_mv[bi][0];
    dy=_mv[bi][1];
    /*Save the block MVs as the current ones while we're here; we'll replace
       them if we don't ultimately choose 4MV mode.*/
    frag_mvs[fragi][0]=(signed char)dx;
    frag_mvs[fragi][1]=(signed char)dy;
    frag_offs=frag_buf_offs[fragi];
    /*Two offsets mean the prediction averages a pair of reference
       positions.*/
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
      satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
    }
    else{
      satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
    }
    /*Reorder through OC_MB_PHASE to match the ordering the analysis
       functions expect.*/
    frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd;
  }
  /*When targeting VP3 compatibility, skipping is disabled for these blocks
     via OC_NOSKIP.*/
  oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
   _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,1);
  /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
  bits0=0;
  bits1=0;
  nqis=_enc->state.nqis;
  for(bi=0;bi<4;bi++){
    /*qii>=nqis marks a block the luma analysis chose to skip.*/
    if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis){
      memset(lbmvs+bi,0,sizeof(*lbmvs));
    }
    else{
      memcpy(lbmvs+bi,_mv+bi,sizeof(*lbmvs));
      /*Tally the MV bits under both schemes: variable-length (bits0) and
         fixed-length (bits1, 12 bits per MV).*/
      bits0+=OC_MV_BITS[0][_mv[bi][0]+31]+OC_MV_BITS[0][_mv[bi][1]+31];
      bits1+=12;
    }
  }
  /*Derive the chroma MVs from the (possibly zeroed) luma block MVs.*/
  (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,
   (const oc_mv *)lbmvs);
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
  ystride=_enc->state.ref_ystride[1];
  for(mapii=4;mapii<map_nidxs;mapii++){
    mapi=map_idxs[mapii];
    pli=mapi>>2;
    bi=mapi&3;
    fragi=mb_map[pli][bi];
    dx=cbmvs[bi][0];
    dy=cbmvs[bi][1];
    frag_offs=frag_buf_offs[fragi];
    /*TODO: We could save half these calls by re-using the results for the Cb
       and Cr planes; is it worth it?*/
    if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,dx,dy)>1){
      satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
    }
    else{
      satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
       ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
    }
    frag_satd[mapii]=satd;
  }
  oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
  /*Overhead: the mode cost plus the marginal MV bits under the cheaper MV
     scheme; the subtraction happens before the <<OC_BIT_SCALE shift.*/
  _modec->overhead+=
   oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
   +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
   -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
  oc_mode_set_cost(_modec,_enc->lambda);
}
1852
/*Analyzes the current frame as a delta (inter) frame.
  For each macro block (in Hilbert order) this chooses a coding mode and
   motion vector(s) based on estimated R-D cost, then quantizes and tokenizes
   the luma plane stripe by stripe, followed by the chroma planes.
  Running totals of the estimated bits required for inter and intra coding
   are maintained so the frame can be abandoned in favor of a keyframe.
  _enc:            The encoding context.
  _allow_keyframe: Non-zero if analysis may abort and request a keyframe.
  _recode:         Non-zero if this frame is being re-coded; previous motion
                    search and half-pel refinement results are re-used.
  Return: 1 if the frame should be coded as a keyframe instead, 0 otherwise.*/
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
  oc_set_chroma_mvs_func set_chroma_mvs;
  oc_enc_pipeline_state pipe;
  oc_qii_state intra_luma_qs;
  oc_mv last_mv;
  oc_mv prior_mv;
  ogg_int64_t interbits;
  ogg_int64_t intrabits;
  const unsigned char *map_idxs;
  int nmap_idxs;
  unsigned *coded_mbis;
  unsigned *uncoded_mbis;
  size_t ncoded_mbis;
  size_t nuncoded_mbis;
  oc_sb_flags *sb_flags;
  signed char *mb_modes;
  const oc_sb_map *sb_maps;
  const oc_mb_map *mb_maps;
  oc_mb_enc_info *embs;
  oc_fragment *frags;
  oc_mv *frag_mvs;
  int qi;
  unsigned stripe_sby;
  unsigned mcu_nvsbs;
  int notstart;
  int notdone;
  int vdec;
  unsigned sbi;
  unsigned sbi_end;
  int refi;
  int pli;
  set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
  _enc->state.frame_type=OC_INTER_FRAME;
  oc_mode_scheme_chooser_reset(&_enc->chooser);
  oc_enc_tokenize_start(_enc);
  oc_enc_pipeline_init(_enc,&pipe);
  if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
  /*mv_bits[0] accumulates the cost of the variable-length MV codes,
     mv_bits[1] the cost of coding each MV in a fixed 12 bits; the cheaper of
     the two totals is used below.*/
  _enc->mv_bits[0]=_enc->mv_bits[1]=0;
  interbits=intrabits=0;
  last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
  /*Choose MVs and MB modes and quantize and code luma.
    Must be done in Hilbert order.*/
  map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
  nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
  qi=_enc->state.qis[0];
  /*The uncoded MB list is filled in backwards from the end of the coded MB
     list's storage.*/
  coded_mbis=_enc->coded_mbis;
  uncoded_mbis=coded_mbis+_enc->state.nmbs;
  ncoded_mbis=0;
  nuncoded_mbis=0;
  _enc->state.ncoded_fragis[0]=0;
  _enc->state.ncoded_fragis[1]=0;
  _enc->state.ncoded_fragis[2]=0;
  sb_flags=_enc->state.sb_flags;
  mb_modes=_enc->state.mb_modes;
  sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
  mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
  embs=_enc->mb_info;
  frags=_enc->state.frags;
  frag_mvs=_enc->state.frag_mvs;
  vdec=!(_enc->state.info.pixel_fmt&2);
  notstart=0;
  notdone=1;
  mcu_nvsbs=_enc->mcu_nvsbs;
  /*Process the frame one stripe of super blocks (one MCU) at a time.*/
  for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
    notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
    sbi_end=pipe.sbi_end[0];
    for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
      int quadi;
      /*Mode addressing is through Y plane, always 4 MB per SB.*/
      for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
        oc_mode_choice modes[8];
        unsigned skip_ssd[12];
        unsigned intra_satd[12];
        int mb_mv_bits_0;
        int mb_gmv_bits_0;
        int inter_mv_pref;
        int mb_mode;
        int dx;
        int dy;
        unsigned mbi;
        int mapii;
        int mapi;
        int bi;
        ptrdiff_t fragi;
        mbi=sbi<<2|quadi;
        /*Motion estimation:
          We always do a basic 1MV search for all macroblocks, coded or not,
           keyframe or not.*/
        if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
        dx=dy=0;
        /*Find the block choice with the lowest estimated coding cost.
          If a Cb or Cr block is coded but no Y' block from a macro block then
           the mode MUST be OC_MODE_INTER_NOMV.
          This is the default state to which the mode data structure is
           initialised in encoder and decoder at the start of each frame.*/
        /*Block coding cost is estimated from correlated SATD metrics.*/
        /*At this point, all blocks that are in frame are still marked coded.*/
        if(!_recode){
          memcpy(embs[mbi].unref_mv,
           embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
          embs[mbi].refined=0;
        }
        oc_mb_intra_satd(_enc,mbi,intra_satd);
        /*Estimate the cost of coding this MB in a keyframe.*/
        if(_allow_keyframe){
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
           pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
          intrabits+=modes[OC_MODE_INTRA].rate;
          for(bi=0;bi<4;bi++){
            oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
             modes[OC_MODE_INTRA].qii[bi]);
          }
        }
        /*Estimate the cost in a delta frame for various modes.*/
        oc_skip_cost(_enc,&pipe,mbi,skip_ssd);
        oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
         OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
        if(_enc->sp_level<OC_SP_LEVEL_NOMC){
          oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
           pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd);
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
           OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
           pipe.fr+0,pipe.qs+0,skip_ssd);
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
           OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
          oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
           OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
          oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
           embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
          mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
           OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
           pipe.fr+0,pipe.qs+0,skip_ssd);
          /*The explicit MV modes (2,6,7) have not yet gone through halfpel
             refinement.
            We choose the explicit MV mode that's already furthest ahead on
             R-D cost and refine only that one.
            We have to be careful to remember which ones we've refined so that
             we don't refine it again if we re-encode this frame.
            The "refined" bit flags below (0x80: 4MV, 0x40: golden 1MV,
             0x04: prev-frame 1MV) record that.*/
          inter_mv_pref=_enc->lambda*3;
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
           modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
            if(!(embs[mbi].refined&0x80)){
              oc_mcenc_refine4mv(_enc,mbi);
              embs[mbi].refined|=0x80;
            }
            oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
             embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
          }
          else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
           modes[OC_MODE_INTER_MV].cost){
            if(!(embs[mbi].refined&0x40)){
              oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
              embs[mbi].refined|=0x40;
            }
            mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
             OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
             pipe.fr+0,pipe.qs+0,skip_ssd);
          }
          /*The prev-frame 1MV mode is always refined.*/
          if(!(embs[mbi].refined&0x04)){
            oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
            embs[mbi].refined|=0x04;
          }
          mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
           OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
           pipe.fr+0,pipe.qs+0,skip_ssd);
          /*Finally, pick the mode with the cheapest estimated R-D cost.*/
          mb_mode=OC_MODE_INTER_NOMV;
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
            mb_mode=OC_MODE_INTRA;
          }
          if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_LAST;
          }
          if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_LAST2;
          }
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_NOMV;
          }
          if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_MV;
          }
          if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_INTER_MV_FOUR;
          }
          /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
          if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
            inter_mv_pref=0;
          }
          if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
            mb_mode=OC_MODE_INTER_MV;
          }
        }
        else{
          /*Motion compensation disabled: only NOMV modes and INTRA are
             considered.*/
          oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
           OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
          mb_mode=OC_MODE_INTER_NOMV;
          if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
            mb_mode=OC_MODE_INTRA;
          }
          if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
            mb_mode=OC_MODE_GOLDEN_NOMV;
          }
          mb_mv_bits_0=mb_gmv_bits_0=0;
        }
        mb_modes[mbi]=mb_mode;
        /*Propagate the MVs to the luma blocks.*/
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
          switch(mb_mode){
            case OC_MODE_INTER_MV:{
              dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
              dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
            }break;
            case OC_MODE_INTER_MV_LAST:{
              dx=last_mv[0];
              dy=last_mv[1];
            }break;
            case OC_MODE_INTER_MV_LAST2:{
              dx=prior_mv[0];
              dy=prior_mv[1];
            }break;
            case OC_MODE_GOLDEN_MV:{
              dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
              dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
            }break;
          }
          for(bi=0;bi<4;bi++){
            fragi=mb_maps[mbi][0][bi];
            frag_mvs[fragi][0]=(signed char)dx;
            frag_mvs[fragi][1]=(signed char)dy;
          }
        }
        for(bi=0;bi<4;bi++){
          fragi=sb_maps[mbi>>2][mbi&3][bi];
          frags[fragi].qii=modes[mb_mode].qii[bi];
        }
        if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
         modes[mb_mode].overhead>>OC_BIT_SCALE)>0){
          int orig_mb_mode;
          /*Quantization may have revised the mode (mb_modes[mbi]); update
             the MV history accordingly.*/
          orig_mb_mode=mb_mode;
          mb_mode=mb_modes[mbi];
          switch(mb_mode){
            case OC_MODE_INTER_MV:{
              memcpy(prior_mv,last_mv,sizeof(prior_mv));
              /*If we're backing out from 4MV, find the MV we're actually
                 using.*/
              if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
                /*NOTE(review): this loop has no bound check; it presumably
                   relies on the >0 return above implying at least one coded
                   luma block in this MB.*/
                for(bi=0;;bi++){
                  fragi=mb_maps[mbi][0][bi];
                  if(frags[fragi].coded){
                    memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
                    dx=frag_mvs[fragi][0];
                    dy=frag_mvs[fragi][1];
                    break;
                  }
                }
                mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
              }
              /*Otherwise we used the original analysis MV.*/
              else{
                memcpy(last_mv,
                 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
              }
              _enc->mv_bits[0]+=mb_mv_bits_0;
              _enc->mv_bits[1]+=12;
            }break;
            case OC_MODE_INTER_MV_LAST2:{
              /*LAST2 swaps the roles of the last and prior MVs.*/
              oc_mv tmp_mv;
              memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
              memcpy(prior_mv,last_mv,sizeof(prior_mv));
              memcpy(last_mv,tmp_mv,sizeof(last_mv));
            }break;
            case OC_MODE_GOLDEN_MV:{
              _enc->mv_bits[0]+=mb_gmv_bits_0;
              _enc->mv_bits[1]+=12;
            }break;
            case OC_MODE_INTER_MV_FOUR:{
              oc_mv lbmvs[4];
              oc_mv cbmvs[4];
              memcpy(prior_mv,last_mv,sizeof(prior_mv));
              for(bi=0;bi<4;bi++){
                fragi=mb_maps[mbi][0][bi];
                if(frags[fragi].coded){
                  memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
                  memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
                  _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
                   +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
                  _enc->mv_bits[1]+=12;
                }
                /*Replace the block MVs for not-coded blocks with (0,0).*/
                else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
              }
              /*Derive the chroma MVs from the (possibly zeroed) luma block
                 MVs and propagate mode/qii/MVs to the chroma fragments.*/
              (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
              for(mapii=4;mapii<nmap_idxs;mapii++){
                mapi=map_idxs[mapii];
                pli=mapi>>2;
                bi=mapi&3;
                fragi=mb_maps[mbi][pli][bi];
                frags[fragi].mb_mode=mb_mode;
                frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
                memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
              }
            }break;
          }
          coded_mbis[ncoded_mbis++]=mbi;
          oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
          interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
        }
        else{
          /*No luma blocks were coded; record the MB as uncoded (list grows
             backwards) and fall back to INTER_NOMV.*/
          *(uncoded_mbis-++nuncoded_mbis)=mbi;
          mb_mode=OC_MODE_INTER_NOMV;
          dx=dy=0;
        }
        /*Propagate final MB mode and MVs to the chroma blocks.
          This has already been done for 4MV mode, since it requires individual
           block motion vectors.*/
        if(mb_mode!=OC_MODE_INTER_MV_FOUR){
          for(mapii=4;mapii<nmap_idxs;mapii++){
            mapi=map_idxs[mapii];
            pli=mapi>>2;
            bi=mapi&3;
            fragi=mb_maps[mbi][pli][bi];
            frags[fragi].mb_mode=mb_mode;
            /*If we switched from 4MV mode to INTER_MV mode, then the qii
               values won't have been chosen with the right MV, but it's
               probaby not worth re-estimating them.*/
            frags[fragi].qii=modes[mb_mode].qii[mapii];
            frag_mvs[fragi][0]=(signed char)dx;
            frag_mvs[fragi][1]=(signed char)dy;
          }
        }
      }
      oc_fr_state_flush_sb(pipe.fr+0);
      sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
      sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
    }
    oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
    /*Code chroma planes.*/
    for(pli=1;pli<3;pli++){
      oc_enc_sb_transform_quantize_chroma(_enc,&pipe,
       pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
      oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
    }
    notstart=1;
  }
  /*Finish filling in the reference frame borders.*/
  refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
  for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
  /*Finish adding flagging overhead costs to inter bit counts to determine if
     we should have coded a key frame instead.*/
  if(_allow_keyframe){
    if(interbits>intrabits)return 1;
    /*Technically the chroma plane counts are over-estimations, because they
       don't account for continuing runs from the luma planes, but the
       inaccuracy is small.*/
    for(pli=0;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
    interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
    interbits+=
     _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
    if(interbits>intrabits)return 1;
  }
  _enc->ncoded_mbis=ncoded_mbis;
  /*Compact the coded fragment list.*/
  {
    ptrdiff_t ncoded_fragis;
    ncoded_fragis=_enc->state.ncoded_fragis[0];
    for(pli=1;pli<3;pli++){
      memmove(_enc->state.coded_fragis+ncoded_fragis,
       _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
       _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
      ncoded_fragis+=_enc->state.ncoded_fragis[pli];
    }
    _enc->state.ntotal_coded_fragis=ncoded_fragis;
  }
  return 0;
}
2231
2232 #if defined(OC_COLLECT_METRICS)
2233 # include <stdio.h>
2234 # include <math.h>
2235
2236 /*TODO: It may be helpful (for block-level quantizers especially) to separate
2237 out the contributions from AC and DC into separate tables.*/
2238
2239 # define OC_ZWEIGHT (0.25)
2240
/*Accumulates one fragment's statistics into a set of metrics.
  The raw sums are stored without the fixed-point scaling; this lets us
   change the scale factor yet still use old data.
  The second-order moments use the standard incremental update, which avoids
   catastrophic cancellation.
  _metrics: The metrics to update.
  _w:       The weight to give this fragment.
  _satd:    The fragment's SATD score.
  _rate:    The bits used to code the fragment, at OC_BIT_SCALE precision.
  _rmse:    The RMSE of the coded fragment.*/
static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
 double _w,int _satd,int _rate,double _rmse){
  double unscaled_rate;
  /*Strip the fixed-point scaling from the rate before accumulating.*/
  unscaled_rate=ldexp(_rate,-OC_BIT_SCALE);
  if(_metrics->fragw>0){
    double delta_satd;
    double delta_rate;
    double delta_rmse;
    double pair_w;
    delta_satd=_satd-_metrics->satd/_metrics->fragw;
    delta_rate=unscaled_rate-_metrics->rate/_metrics->fragw;
    delta_rmse=_rmse-_metrics->rmse/_metrics->fragw;
    pair_w=_metrics->fragw*_w/(_metrics->fragw+_w);
    _metrics->satd2+=delta_satd*delta_satd*pair_w;
    _metrics->satdrate+=delta_satd*delta_rate*pair_w;
    _metrics->rate2+=delta_rate*delta_rate*pair_w;
    _metrics->satdrmse+=delta_satd*delta_rmse*pair_w;
    _metrics->rmse2+=delta_rmse*delta_rmse*pair_w;
  }
  /*First-order weighted sums.*/
  _metrics->fragw+=_w;
  _metrics->satd+=_satd*_w;
  _metrics->rate+=unscaled_rate*_w;
  _metrics->rmse+=_rmse*_w;
}
2267
/*Merges _n sets of metrics into a single combined set.
  The second-order moments are combined with the pairwise update formulas, so
   the result matches what direct accumulation would have produced.
  _dst: Receives the merged metrics.
  _src: The array of metrics sets to merge.
  _n:   The number of sets in the array.*/
static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
 const oc_mode_metrics *_src,int _n){
  int si;
  /*Skip ahead to the first set with any accumulated weight.*/
  si=0;
  while(si<_n&&_src[si].fragw<=0)si++;
  if(si>=_n){
    /*Every input set was empty, so the result is empty too.*/
    memset(_dst,0,sizeof(*_dst));
    return;
  }
  /*Seed the result with the first non-empty set...*/
  memcpy(_dst,_src+si,sizeof(*_dst));
  /*...and fold in each remaining non-empty set.*/
  while(++si<_n){
    double wa;
    double wb;
    double dsatd;
    double drate;
    double drmse;
    double w;
    if(_src[si].fragw<=0)continue;
    wa=_dst->fragw;
    wb=_src[si].fragw;
    /*Differences between the two sets' means.*/
    dsatd=_src[si].satd/wb-_dst->satd/wa;
    drate=_src[si].rate/wb-_dst->rate/wa;
    drmse=_src[si].rmse/wb-_dst->rmse/wa;
    w=wa*wb/(wa+wb);
    _dst->fragw+=_src[si].fragw;
    _dst->satd+=_src[si].satd;
    _dst->rate+=_src[si].rate;
    _dst->rmse+=_src[si].rmse;
    _dst->satd2+=_src[si].satd2+dsatd*dsatd*w;
    _dst->satdrate+=_src[si].satdrate+dsatd*drate*w;
    _dst->rate2+=_src[si].rate2+drate*drate*w;
    _dst->satdrmse+=_src[si].satdrmse+dsatd*drmse*w;
    _dst->rmse2+=_src[si].rmse2+drmse*drmse*w;
  }
}
2303
/*Compile collected SATD/rate/RMSE metrics into a form that's immediately
   useful for mode decision.
  For each (plane, frame type, SAD bin) this fits a least-squares line of
   rate vs. SATD and RMSE vs. SATD over a window of neighboring bins, then
   stores the line's value at the bin center in OC_MODE_RD.
  _qi: The quality index whose table entries to rebuild.*/
static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){
  int pli;
  int qti;
  oc_restore_fpu(&_enc->state);
  /*Convert raw collected data into cleaned up sample points.*/
  for(pli=0;pli<3;pli++){
    for(qti=0;qti<2;qti++){
      double fragw;
      int bin0;
      int bin1;
      int bin;
      /*[bin0,bin1) is a sliding window of bins around the current bin;
        fragw tracks the total weight inside the window.*/
      fragw=0;
      bin0=bin1=0;
      for(bin=0;bin<OC_SAD_BINS;bin++){
        oc_mode_metrics metrics;
        OC_MODE_RD[_qi][pli][qti][bin].rate=0;
        OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
        /*Find some points on either side of the current bin.
          Grow the right edge until the window covers the bin and holds at
           least OC_ZWEIGHT total weight.*/
        while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
          fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
        }
        /*Shrink the left edge while enough weight remains.*/
        while(bin0+1<bin&&bin0+1<bin1&&
         fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
          fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
        }
        /*Merge statistics and fit lines.*/
        oc_mode_metrics_merge(&metrics,
         OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
        if(metrics.fragw>0&&metrics.satd2>0){
          double a;
          double b;
          double msatd;
          double mrate;
          double mrmse;
          double rate;
          double rmse;
          msatd=metrics.satd/metrics.fragw;
          mrate=metrics.rate/metrics.fragw;
          mrmse=metrics.rmse/metrics.fragw;
          /*Compute the points on these lines corresponding to the actual bin
             value.
            Slope b=cov(satd,rate)/var(satd); intercept a from the means.*/
          b=metrics.satdrate/metrics.satd2;
          a=mrate-b*msatd;
          /*Rescale to fixed point and clamp to the 16-bit table entries.*/
          rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
          OC_MODE_RD[_qi][pli][qti][bin].rate=
           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
          b=metrics.satdrmse/metrics.satd2;
          a=mrmse-b*msatd;
          rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
          OC_MODE_RD[_qi][pli][qti][bin].rmse=
           (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
        }
      }
    }
  }
}
2362
2363
2364
2365 /*The following token skipping code used to also be used in the decoder (and
2366 even at one point other places in the encoder).
2367 However, it was obsoleted by other optimizations, and is now only used here.
2368 It has been moved here to avoid generating the code when it's not needed.*/
2369
2370 /*Determines the number of blocks or coefficients to be skipped for a given
2371 token value.
2372 _token: The token value to skip.
2373 _extra_bits: The extra bits attached to this token.
2374 Return: A positive value indicates that number of coefficients are to be
2375 skipped in the current block.
2376 Otherwise, the negative of the return value indicates that number of
2377 blocks are to be ended.*/
2378 typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
2379
/*Handles the simple end of block tokens.
  Tokens 0...5 code EOB runs with base lengths 1, 2, 3, 4, 8, and 16; any
   extra bits extend the run further.*/
static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
  int run_base;
  run_base=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
  return -_extra_bits-run_base;
}
2386
/*The last EOB token has a special case, where an EOB run of size zero ends
   all the remaining blocks in the frame.*/
static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
  if(_extra_bits!=0)return -_extra_bits;
  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
     yet available everywhere; this should be equivalent.*/
  return -(~(size_t)0>>1);
}
2395
/*Handles the pure zero run tokens.
  The run length is one more than the extra bits value.*/
static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
  ptrdiff_t nskip;
  nskip=_extra_bits+1;
  return nskip;
}
2400
/*Handles a normal coefficient value token, which always codes exactly one
   coefficient.*/
static ptrdiff_t oc_token_skip_val(void){
  return (ptrdiff_t)1;
}
2405
2406 /*Handles a category 1A zero run/coefficient value combo token.*/
oc_token_skip_run_cat1a(int _token)2407 static ptrdiff_t oc_token_skip_run_cat1a(int _token){
2408 return _token-OC_DCT_RUN_CAT1A+2;
2409 }
2410
2411 /*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
oc_token_skip_run(int _token,int _extra_bits)2412 static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
2413 int run_cati;
2414 int ncoeffs_mask;
2415 int ncoeffs_adjust;
2416 run_cati=_token-OC_DCT_RUN_CAT1B;
2417 ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
2418 ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
2419 return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
2420 }
2421
/*A jump table for computing the number of coefficients or blocks to skip for
   a given token value.
  This reduces all the conditional branches, etc., needed to parse these token
   values down to one indirect jump.
  NOTE(review): the casts below call oc_token_skip_val (which takes no
   arguments) and oc_token_skip_run_cat1a (which takes one) through a
   two-argument function pointer type; this is technically undefined behavior
   in ISO C, though it works on the calling conventions this code targets —
   confirm before porting to an unusual ABI.*/
static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
  oc_token_skip_eob,
  oc_token_skip_eob,
  oc_token_skip_eob,
  oc_token_skip_eob,
  oc_token_skip_eob,
  oc_token_skip_eob,
  oc_token_skip_eob6,
  oc_token_skip_zrl,
  oc_token_skip_zrl,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_val,
  (oc_token_skip_func)oc_token_skip_run_cat1a,
  (oc_token_skip_func)oc_token_skip_run_cat1a,
  (oc_token_skip_func)oc_token_skip_run_cat1a,
  (oc_token_skip_func)oc_token_skip_run_cat1a,
  (oc_token_skip_func)oc_token_skip_run_cat1a,
  oc_token_skip_run,
  oc_token_skip_run,
  oc_token_skip_run,
  oc_token_skip_run
};
2460
2461 /*Determines the number of blocks or coefficients to be skipped for a given
2462 token value.
2463 _token: The token value to skip.
2464 _extra_bits: The extra bits attached to this token.
2465 Return: A positive value indicates that number of coefficients are to be
2466 skipped in the current block.
2467 Otherwise, the negative of the return value indicates that number of
2468 blocks are to be ended.
2469 0 will never be returned, so that at least one coefficient in one
2470 block will always be decoded for every token.*/
oc_dct_token_skip(int _token,int _extra_bits)2471 static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
2472 return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
2473 }
2474
2475
2476
/*Accumulates the SATD/rate/RMSE statistics of every coded fragment in the
   current frame into the global OC_MODE_METRICS table.
  On the first call this also loads any statistics previously saved to
   "modedec.stats" in the current directory, and the per-qi OC_MODE_RD tables
   in use are refreshed afterwards.
  _enc: The encoding context.*/
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
  /*Maps a zig-zag coefficient index to the offset of its Huffman table group
     (DC, AC groups 1-4).*/
  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
     0,16,16,16,16,16,32,32,
    32,32,32,32,32,32,32,48,
    48,48,48,48,48,48,48,48,
    48,48,48,48,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64
  };
  const oc_fragment *frags;
  const unsigned *frag_satd;
  const unsigned *frag_ssd;
  const ptrdiff_t *coded_fragis;
  ptrdiff_t ncoded_fragis;
  ptrdiff_t fragii;
  double fragw;
  int qti;
  int qii;
  int qi;
  int pli;
  int zzi;
  int token;
  int eb;
  oc_restore_fpu(&_enc->state);
  /*Load any existing mode metrics if we haven't already.*/
  if(!oc_has_mode_metrics){
    FILE *fmetrics;
    memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
    fmetrics=fopen("modedec.stats","rb");
    if(fmetrics!=NULL){
      /*If the read fails or comes up short, discard any partially read data
         and start from the zeroed table rather than using garbage.*/
      if(fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics)<1){
        memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
      }
      fclose(fmetrics);
    }
    for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
    oc_has_mode_metrics=1;
  }
  qti=_enc->state.frame_type;
  frags=_enc->state.frags;
  frag_satd=_enc->frag_satd;
  frag_ssd=_enc->frag_ssd;
  coded_fragis=_enc->state.coded_fragis;
  ncoded_fragis=fragii=0;
  /*Weight the fragments by the inverse frame size; this prevents HD content
     from dominating the statistics.*/
  fragw=1.0/_enc->state.nfrags;
  for(pli=0;pli<3;pli++){
    ptrdiff_t ti[64];
    int eob_token[64];
    int eob_run[64];
    /*Set up token indices and eob run counts.
      We don't bother trying to figure out the real cost of the runs that span
       coefficients; instead we use the costs that were available when R-D
       token optimization was done.*/
    for(zzi=0;zzi<64;zzi++){
      ti[zzi]=_enc->dct_token_offs[pli][zzi];
      if(ti[zzi]>0){
        token=_enc->dct_tokens[pli][zzi][0];
        eb=_enc->extra_bits[pli][zzi][0];
        eob_token[zzi]=token;
        eob_run[zzi]=-oc_dct_token_skip(token,eb);
      }
      else{
        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        eob_run[zzi]=0;
      }
    }
    /*Scan the list of coded fragments for this plane.*/
    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
    for(;fragii<ncoded_fragis;fragii++){
      ptrdiff_t fragi;
      ogg_uint32_t frag_bits;
      int huffi;
      int skip;
      int mb_mode;
      unsigned satd;
      int bin;
      fragi=coded_fragis[fragii];
      /*Re-walk this fragment's token stream to total up the bits it used.*/
      frag_bits=0;
      for(zzi=0;zzi<64;){
        if(eob_run[zzi]>0){
          /*We've reached the end of the block.*/
          eob_run[zzi]--;
          break;
        }
        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
         +OC_ZZI_HUFF_OFFSET[zzi];
        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
          /*This token caused an EOB run to be flushed.
            Therefore it gets the bits associated with it.*/
          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        }
        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
        ti[zzi]++;
        skip=oc_dct_token_skip(token,eb);
        if(skip<0){
          eob_token[zzi]=token;
          eob_run[zzi]=-skip;
        }
        else{
          /*A regular DCT value token; accumulate the bits for it.*/
          frag_bits+=_enc->huff_codes[huffi][token].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[token];
          zzi+=skip;
        }
      }
      mb_mode=frags[fragi].mb_mode;
      qi=_enc->state.qis[frags[fragi].qii];
      /*Chroma SATD scores are scaled up by 4 ((pli+1)&2 is 0 for luma, 2 for
         Cb/Cr) before binning.*/
      satd=frag_satd[fragi]<<(pli+1&2);
      bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1);
      oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin,
       fragw,satd,frag_bits<<OC_BIT_SCALE,sqrt(frag_ssd[fragi]));
    }
  }
  /*Update global SATD/rate/RMSE estimation matrix.*/
  for(qii=0;qii<_enc->state.nqis;qii++){
    oc_enc_mode_metrics_update(_enc,_enc->state.qis[qii]);
  }
}
2599
/*Saves the collected raw metrics to "modedec.stats" and prints a complete
   replacement modedec.h, including the fitted OC_MODE_RD tables, to stdout.
  _enc: The encoding context.*/
void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){
  FILE *fmetrics;
  int qi;
  /*Generate sample points for complete list of QI values.*/
  for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
  /*Save the raw statistics for future runs; write/close errors are ignored
     here.*/
  fmetrics=fopen("modedec.stats","wb");
  if(fmetrics!=NULL){
    fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
    fclose(fmetrics);
  }
  /*Emit the header boilerplate and the scale/bin constants in effect.*/
  fprintf(stdout,
   "/*File generated by libtheora with OC_COLLECT_METRICS"
   " defined at compile time.*/\n"
   "#if !defined(_modedec_H)\n"
   "# define _modedec_H (1)\n"
   "\n"
   "\n"
   "\n"
   "# if defined(OC_COLLECT_METRICS)\n"
   "typedef struct oc_mode_metrics oc_mode_metrics;\n"
   "# endif\n"
   "typedef struct oc_mode_rd oc_mode_rd;\n"
   "\n"
   "\n"
   "\n"
   "/*The number of extra bits of precision at which to store rate"
   " metrics.*/\n"
   "# define OC_BIT_SCALE (%i)\n"
   "/*The number of extra bits of precision at which to store RMSE metrics.\n"
   " This must be at least half OC_BIT_SCALE (rounded up).*/\n"
   "# define OC_RMSE_SCALE (%i)\n"
   "/*The number of bins to partition statistics into.*/\n"
   "# define OC_SAD_BINS (%i)\n"
   "/*The number of bits of precision to drop"
   " from SAD scores to assign them to a\n"
   " bin.*/\n"
   "# define OC_SAD_SHIFT (%i)\n"
   "\n"
   "\n"
   "\n"
   "# if defined(OC_COLLECT_METRICS)\n"
   "struct oc_mode_metrics{\n"
   "  double fragw;\n"
   "  double satd;\n"
   "  double rate;\n"
   "  double rmse;\n"
   "  double satd2;\n"
   "  double satdrate;\n"
   "  double rate2;\n"
   "  double satdrmse;\n"
   "  double rmse2;\n"
   "};\n"
   "\n"
   "\n"
   "int oc_has_mode_metrics;\n"
   "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
   "# endif\n"
   "\n"
   "\n"
   "\n"
   "struct oc_mode_rd{\n"
   "  ogg_int16_t rate;\n"
   "  ogg_int16_t rmse;\n"
   "};\n"
   "\n"
   "\n"
   "# if !defined(OC_COLLECT_METRICS)\n"
   "static const\n"
   "# endif\n"
   "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
   OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
  /*Emit the fitted rate/RMSE table entries, nested as
     [qi][plane][frame type][bin].*/
  for(qi=0;qi<64;qi++){
    int pli;
    fprintf(stdout,"  {\n");
    for(pli=0;pli<3;pli++){
      int qti;
      fprintf(stdout,"    {\n");
      for(qti=0;qti<2;qti++){
        int bin;
        static const char *pl_names[3]={"Y'","Cb","Cr"};
        static const char *qti_names[2]={"INTRA","INTER"};
        fprintf(stdout,"      /*%s  qi=%i  %s*/\n",
         pl_names[pli],qi,qti_names[qti]);
        fprintf(stdout,"      {\n");
        fprintf(stdout,"        ");
        for(bin=0;bin<OC_SAD_BINS;bin++){
          /*Four entries per line.*/
          if(bin&&!(bin&0x3))fprintf(stdout,"\n        ");
          fprintf(stdout,"{%5i,%5i}",
           OC_MODE_RD[qi][pli][qti][bin].rate,
           OC_MODE_RD[qi][pli][qti][bin].rmse);
          if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
        }
        fprintf(stdout,"\n      }");
        if(qti<1)fprintf(stdout,",");
        fprintf(stdout,"\n");
      }
      fprintf(stdout,"    }");
      if(pli<2)fprintf(stdout,",");
      fprintf(stdout,"\n");
    }
    fprintf(stdout,"  }");
    if(qi<63)fprintf(stdout,",");
    fprintf(stdout,"\n");
  }
  fprintf(stdout,
   "};\n"
   "\n"
   "#endif\n");
}
2709 #endif
2710