1 /*****************************************************************************
2 *
3 * XVID MPEG-4 VIDEO CODEC
4 * - MB Transfer/Quantization functions -
5 *
6 * Copyright(C) 2001-2010 Peter Ross <pross@xvid.org>
7 * 2001-2010 Michael Militzer <michael@xvid.org>
8 * 2003 Edouard Gomez <ed.gomez@free.fr>
9 *
10 * This program is free software ; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation ; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program ; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * $Id: mbtransquant.c 2053 2011-11-04 15:23:46Z Isibaar $
25 *
26 ****************************************************************************/
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "../portab.h"
33 #include "mbfunctions.h"
34
35 #include "../global.h"
36 #include "mem_transfer.h"
37 #include "timer.h"
38 #include "../bitstream/mbcoding.h"
39 #include "../bitstream/zigzag.h"
40 #include "../dct/fdct.h"
41 #include "../dct/idct.h"
42 #include "../quant/quant.h"
43 #include "../motion/sad.h"
44 #include "../encoder.h"
45
46 #include "../quant/quant_matrix.h"
47
48 MBFIELDTEST_PTR MBFieldTest;
49
50 /*
51 * Skip blocks having a coefficient sum below this value. This value will be
52 * corrected according to the MB quantizer to avoid artifacts for quant==1
53 */
54 #define PVOP_TOOSMALL_LIMIT 1
55 #define BVOP_TOOSMALL_LIMIT 3
56
57 /*****************************************************************************
58 * Local functions
59 ****************************************************************************/
60
61 /* permute block and return field dct choice */
62 static __inline uint32_t
MBDecideFieldDCT(int16_t data[6* 64])63 MBDecideFieldDCT(int16_t data[6 * 64])
64 {
65 uint32_t field = MBFieldTest(data);
66
67 if (field)
68 MBFrameToField(data);
69
70 return field;
71 }
72
73 /* Performs Forward DCT on all blocks */
74 static __inline void
MBfDCT(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,uint32_t x_pos,uint32_t y_pos,int16_t data[6* 64])75 MBfDCT(const MBParam * const pParam,
76 const FRAMEINFO * const frame,
77 MACROBLOCK * const pMB,
78 uint32_t x_pos,
79 uint32_t y_pos,
80 int16_t data[6 * 64])
81 {
82 /* Handles interlacing */
83 start_timer();
84 pMB->field_dct = 0;
85 if ((frame->vol_flags & XVID_VOL_INTERLACING) &&
86 (x_pos>0) && (x_pos<pParam->mb_width-1) &&
87 (y_pos>0) && (y_pos<pParam->mb_height-1)) {
88 pMB->field_dct = MBDecideFieldDCT(data);
89 }
90 stop_interlacing_timer();
91
92 /* Perform DCT */
93 start_timer();
94 fdct((short * const)&data[0 * 64]);
95 fdct((short * const)&data[1 * 64]);
96 fdct((short * const)&data[2 * 64]);
97 fdct((short * const)&data[3 * 64]);
98 fdct((short * const)&data[4 * 64]);
99 fdct((short * const)&data[5 * 64]);
100 stop_dct_timer();
101 }
102
103 /* Performs Inverse DCT on all blocks */
104 static __inline void
MBiDCT(int16_t data[6* 64],const uint8_t cbp)105 MBiDCT(int16_t data[6 * 64],
106 const uint8_t cbp)
107 {
108 start_timer();
109 if(cbp & (1 << (5 - 0))) idct((short * const)&data[0 * 64]);
110 if(cbp & (1 << (5 - 1))) idct((short * const)&data[1 * 64]);
111 if(cbp & (1 << (5 - 2))) idct((short * const)&data[2 * 64]);
112 if(cbp & (1 << (5 - 3))) idct((short * const)&data[3 * 64]);
113 if(cbp & (1 << (5 - 4))) idct((short * const)&data[4 * 64]);
114 if(cbp & (1 << (5 - 5))) idct((short * const)&data[5 * 64]);
115 stop_idct_timer();
116 }
117
118 /* Quantize all blocks -- Intra mode */
119 static __inline void
MBQuantIntra(const MBParam * pParam,const FRAMEINFO * const frame,const MACROBLOCK * pMB,int16_t qcoeff[6* 64],int16_t data[6* 64])120 MBQuantIntra(const MBParam * pParam,
121 const FRAMEINFO * const frame,
122 const MACROBLOCK * pMB,
123 int16_t qcoeff[6 * 64],
124 int16_t data[6*64])
125 {
126 int scaler_lum, scaler_chr;
127 quant_intraFuncPtr quant;
128
129 /* check if quant matrices need to be re-initialized with new quant */
130 if (pParam->vol_flags & XVID_VOL_MPEGQUANT) {
131 if (pParam->last_quant_initialized_intra != pMB->quant) {
132 init_intra_matrix(pParam->mpeg_quant_matrices, pMB->quant);
133 }
134 quant = quant_mpeg_intra;
135 } else {
136 quant = quant_h263_intra;
137 }
138
139 scaler_lum = get_dc_scaler(pMB->quant, 1);
140 scaler_chr = get_dc_scaler(pMB->quant, 0);
141
142 /* Quantize the block */
143 start_timer();
144 quant(&data[0 * 64], &qcoeff[0 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
145 quant(&data[1 * 64], &qcoeff[1 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
146 quant(&data[2 * 64], &qcoeff[2 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
147 quant(&data[3 * 64], &qcoeff[3 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
148 quant(&data[4 * 64], &qcoeff[4 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices);
149 quant(&data[5 * 64], &qcoeff[5 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices);
150 stop_quant_timer();
151 }
152
153 /* DeQuantize all blocks -- Intra mode */
154 static __inline void
MBDeQuantIntra(const MBParam * pParam,const int iQuant,int16_t qcoeff[6* 64],int16_t data[6* 64])155 MBDeQuantIntra(const MBParam * pParam,
156 const int iQuant,
157 int16_t qcoeff[6 * 64],
158 int16_t data[6*64])
159 {
160 int mpeg;
161 int scaler_lum, scaler_chr;
162
163 quant_intraFuncPtr const dequant[2] =
164 {
165 dequant_h263_intra,
166 dequant_mpeg_intra
167 };
168
169 mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
170 scaler_lum = get_dc_scaler(iQuant, 1);
171 scaler_chr = get_dc_scaler(iQuant, 0);
172
173 start_timer();
174 dequant[mpeg](&qcoeff[0 * 64], &data[0 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
175 dequant[mpeg](&qcoeff[1 * 64], &data[1 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
176 dequant[mpeg](&qcoeff[2 * 64], &data[2 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
177 dequant[mpeg](&qcoeff[3 * 64], &data[3 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
178 dequant[mpeg](&qcoeff[4 * 64], &data[4 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices);
179 dequant[mpeg](&qcoeff[5 * 64], &data[5 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices);
180 stop_iquant_timer();
181 }
182
183 static int
184 dct_quantize_trellis_c(int16_t *const Out,
185 const int16_t *const In,
186 int Q,
187 const uint16_t * const Zigzag,
188 const uint16_t * const QuantMatrix,
189 int Non_Zero,
190 int Sum,
191 int Lambda_Mod,
192 const uint32_t rel_var8,
193 const int Metric);
194
195 /* Quantize all blocks -- Inter mode */
196 static __inline uint8_t
MBQuantInter(const MBParam * pParam,const FRAMEINFO * const frame,const MACROBLOCK * pMB,int16_t data[6* 64],int16_t qcoeff[6* 64],int bvop,int limit)197 MBQuantInter(const MBParam * pParam,
198 const FRAMEINFO * const frame,
199 const MACROBLOCK * pMB,
200 int16_t data[6 * 64],
201 int16_t qcoeff[6 * 64],
202 int bvop,
203 int limit)
204 {
205
206 int i;
207 uint8_t cbp = 0;
208 int sum;
209 int code_block, mpeg;
210
211 quant_interFuncPtr const quant[2] =
212 {
213 quant_h263_inter,
214 quant_mpeg_inter
215 };
216
217 mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
218
219 for (i = 0; i < 6; i++) {
220
221 /* Quantize the block */
222 start_timer();
223
224 sum = quant[mpeg](&qcoeff[i*64], &data[i*64], pMB->quant, pParam->mpeg_quant_matrices);
225
226 if(sum && (pMB->quant > 2) && (frame->vop_flags & XVID_VOP_TRELLISQUANT)) {
227 const uint16_t *matrix;
228 const static uint16_t h263matrix[] =
229 {
230 16, 16, 16, 16, 16, 16, 16, 16,
231 16, 16, 16, 16, 16, 16, 16, 16,
232 16, 16, 16, 16, 16, 16, 16, 16,
233 16, 16, 16, 16, 16, 16, 16, 16,
234 16, 16, 16, 16, 16, 16, 16, 16,
235 16, 16, 16, 16, 16, 16, 16, 16,
236 16, 16, 16, 16, 16, 16, 16, 16,
237 16, 16, 16, 16, 16, 16, 16, 16
238 };
239
240 matrix = (mpeg)?get_inter_matrix(pParam->mpeg_quant_matrices):h263matrix;
241 sum = dct_quantize_trellis_c(&qcoeff[i*64], &data[i*64],
242 pMB->quant, &scan_tables[0][0],
243 matrix,
244 63,
245 sum,
246 pMB->lambda[i],
247 pMB->rel_var8[i],
248 !!(frame->vop_flags & XVID_VOP_RD_PSNRHVSM));
249 }
250 stop_quant_timer();
251
252 /*
253 * We code the block if the sum is higher than the limit and if the first
254 * two AC coefficients in zig zag order are not zero.
255 */
256 code_block = 0;
257 if ((sum >= limit) || (qcoeff[i*64+1] != 0) || (qcoeff[i*64+8] != 0)) {
258 code_block = 1;
259 } else {
260
261 if (bvop && (pMB->mode == MODE_DIRECT || pMB->mode == MODE_DIRECT_NO4V)) {
262 /* dark blocks prevention for direct mode */
263 if ((qcoeff[i*64] < -1) || (qcoeff[i*64] > 0))
264 code_block = 1;
265 } else {
266 /* not direct mode */
267 if (qcoeff[i*64] != 0)
268 code_block = 1;
269 }
270 }
271
272 /* Set the corresponding cbp bit */
273 cbp |= code_block << (5 - i);
274 }
275
276 return(cbp);
277 }
278
279 /* DeQuantize all blocks -- Inter mode */
280 static __inline void
MBDeQuantInter(const MBParam * pParam,const int iQuant,int16_t data[6* 64],int16_t qcoeff[6* 64],const uint8_t cbp)281 MBDeQuantInter(const MBParam * pParam,
282 const int iQuant,
283 int16_t data[6 * 64],
284 int16_t qcoeff[6 * 64],
285 const uint8_t cbp)
286 {
287 int mpeg;
288
289 quant_interFuncPtr const dequant[2] =
290 {
291 dequant_h263_inter,
292 dequant_mpeg_inter
293 };
294
295 mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
296
297 start_timer();
298 if(cbp & (1 << (5 - 0))) dequant[mpeg](&data[0 * 64], &qcoeff[0 * 64], iQuant, pParam->mpeg_quant_matrices);
299 if(cbp & (1 << (5 - 1))) dequant[mpeg](&data[1 * 64], &qcoeff[1 * 64], iQuant, pParam->mpeg_quant_matrices);
300 if(cbp & (1 << (5 - 2))) dequant[mpeg](&data[2 * 64], &qcoeff[2 * 64], iQuant, pParam->mpeg_quant_matrices);
301 if(cbp & (1 << (5 - 3))) dequant[mpeg](&data[3 * 64], &qcoeff[3 * 64], iQuant, pParam->mpeg_quant_matrices);
302 if(cbp & (1 << (5 - 4))) dequant[mpeg](&data[4 * 64], &qcoeff[4 * 64], iQuant, pParam->mpeg_quant_matrices);
303 if(cbp & (1 << (5 - 5))) dequant[mpeg](&data[5 * 64], &qcoeff[5 * 64], iQuant, pParam->mpeg_quant_matrices);
304 stop_iquant_timer();
305 }
306
307 typedef void (transfer_operation_8to16_t) (int16_t *Dst, const uint8_t *Src, int BpS);
308 typedef void (transfer_operation_16to8_t) (uint8_t *Dst, const int16_t *Src, int BpS);
309
310
311 static __inline void
MBTrans8to16(const MBParam * const pParam,const FRAMEINFO * const frame,const MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64])312 MBTrans8to16(const MBParam * const pParam,
313 const FRAMEINFO * const frame,
314 const MACROBLOCK * const pMB,
315 const uint32_t x_pos,
316 const uint32_t y_pos,
317 int16_t data[6 * 64])
318 {
319 uint32_t stride = pParam->edged_width;
320 uint32_t stride2 = stride / 2;
321 uint32_t next_block = stride * 8;
322 uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
323 const IMAGE * const pCurrent = &frame->image;
324
325 /* Image pointers */
326 pY_Cur = pCurrent->y + (y_pos << 4) * stride + (x_pos << 4);
327 pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3);
328 pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3);
329
330 /* Do the transfer */
331 start_timer();
332 transfer_8to16copy(&data[0 * 64], pY_Cur, stride);
333 transfer_8to16copy(&data[1 * 64], pY_Cur + 8, stride);
334 transfer_8to16copy(&data[2 * 64], pY_Cur + next_block, stride);
335 transfer_8to16copy(&data[3 * 64], pY_Cur + next_block + 8, stride);
336 transfer_8to16copy(&data[4 * 64], pU_Cur, stride2);
337 transfer_8to16copy(&data[5 * 64], pV_Cur, stride2);
338 stop_transfer_timer();
339 }
340
341 static __inline void
MBTrans16to8(const MBParam * const pParam,const FRAMEINFO * const frame,const MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],const uint32_t add,const uint8_t cbp)342 MBTrans16to8(const MBParam * const pParam,
343 const FRAMEINFO * const frame,
344 const MACROBLOCK * const pMB,
345 const uint32_t x_pos,
346 const uint32_t y_pos,
347 int16_t data[6 * 64],
348 const uint32_t add, /* Must be 1 or 0 */
349 const uint8_t cbp)
350 {
351 uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
352 uint32_t stride = pParam->edged_width;
353 uint32_t stride2 = stride / 2;
354 uint32_t next_block = stride * 8;
355 const IMAGE * const pCurrent = &frame->image;
356
357 /* Array of function pointers, indexed by [add] */
358 transfer_operation_16to8_t * const functions[2] =
359 {
360 (transfer_operation_16to8_t*)transfer_16to8copy,
361 (transfer_operation_16to8_t*)transfer_16to8add,
362 };
363
364 transfer_operation_16to8_t *transfer_op = NULL;
365
366 /* Image pointers */
367 pY_Cur = pCurrent->y + (y_pos << 4) * stride + (x_pos << 4);
368 pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3);
369 pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3);
370
371 if (pMB->field_dct) {
372 next_block = stride;
373 stride *= 2;
374 }
375
376 /* Operation function */
377 transfer_op = functions[add];
378
379 /* Do the operation */
380 start_timer();
381 if (cbp&32) transfer_op(pY_Cur, &data[0 * 64], stride);
382 if (cbp&16) transfer_op(pY_Cur + 8, &data[1 * 64], stride);
383 if (cbp& 8) transfer_op(pY_Cur + next_block, &data[2 * 64], stride);
384 if (cbp& 4) transfer_op(pY_Cur + next_block + 8, &data[3 * 64], stride);
385 if (cbp& 2) transfer_op(pU_Cur, &data[4 * 64], stride2);
386 if (cbp& 1) transfer_op(pV_Cur, &data[5 * 64], stride2);
387 stop_transfer_timer();
388 }
389
390 /*****************************************************************************
391 * Module functions
392 ****************************************************************************/
393
394 void
MBTransQuantIntra(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])395 MBTransQuantIntra(const MBParam * const pParam,
396 const FRAMEINFO * const frame,
397 MACROBLOCK * const pMB,
398 const uint32_t x_pos,
399 const uint32_t y_pos,
400 int16_t data[6 * 64],
401 int16_t qcoeff[6 * 64])
402 {
403
404 /* Transfer data */
405 MBTrans8to16(pParam, frame, pMB, x_pos, y_pos, data);
406
407 /* Perform DCT (and field decision) */
408 MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
409
410 /* Quantize the block */
411 MBQuantIntra(pParam, frame, pMB, data, qcoeff);
412
413 /* DeQuantize the block */
414 MBDeQuantIntra(pParam, pMB->quant, data, qcoeff);
415
416 /* Perform inverse DCT*/
417 MBiDCT(data, 0x3F);
418
419 /* Transfer back the data -- Don't add data */
420 MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 0, 0x3F);
421 }
422
423
424 uint8_t
MBTransQuantInter(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])425 MBTransQuantInter(const MBParam * const pParam,
426 const FRAMEINFO * const frame,
427 MACROBLOCK * const pMB,
428 const uint32_t x_pos,
429 const uint32_t y_pos,
430 int16_t data[6 * 64],
431 int16_t qcoeff[6 * 64])
432 {
433 uint8_t cbp;
434 uint32_t limit;
435
436 /* There is no MBTrans8to16 for Inter block, that's done in motion compensation
437 * already */
438
439 /* Perform DCT (and field decision) */
440 MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
441
442 /* Set the limit threshold */
443 limit = PVOP_TOOSMALL_LIMIT + ((pMB->quant == 1)? 1 : 0);
444
445 if (frame->vop_flags & XVID_VOP_CARTOON)
446 limit *= 3;
447
448 /* Quantize the block */
449 cbp = MBQuantInter(pParam, frame, pMB, data, qcoeff, 0, limit);
450
451 /* DeQuantize the block */
452 MBDeQuantInter(pParam, pMB->quant, data, qcoeff, cbp);
453
454 /* Perform inverse DCT*/
455 MBiDCT(data, cbp);
456
457 /* Transfer back the data -- Add the data */
458 MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 1, cbp);
459
460 return(cbp);
461 }
462
463 uint8_t
MBTransQuantInterBVOP(const MBParam * pParam,FRAMEINFO * frame,MACROBLOCK * pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])464 MBTransQuantInterBVOP(const MBParam * pParam,
465 FRAMEINFO * frame,
466 MACROBLOCK * pMB,
467 const uint32_t x_pos,
468 const uint32_t y_pos,
469 int16_t data[6 * 64],
470 int16_t qcoeff[6 * 64])
471 {
472 uint8_t cbp;
473 uint32_t limit;
474
475 /* There is no MBTrans8to16 for Inter block, that's done in motion compensation
476 * already */
477
478 /* Perform DCT (and field decision) */
479 MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
480
481 /* Set the limit threshold */
482 limit = BVOP_TOOSMALL_LIMIT;
483
484 if (frame->vop_flags & XVID_VOP_CARTOON)
485 limit *= 2;
486
487 /* Quantize the block */
488 cbp = MBQuantInter(pParam, frame, pMB, data, qcoeff, 1, limit);
489
490 /*
491 * History comment:
492 * We don't have to DeQuant, iDCT and Transfer back data for B-frames.
493 *
494 * BUT some plugins require the rebuilt original frame to be passed so we
495 * have to take care of that here
496 */
497 if((pParam->plugin_flags & XVID_REQORIGINAL)) {
498
499 /* DeQuantize the block */
500 MBDeQuantInter(pParam, pMB->quant, data, qcoeff, cbp);
501
502 /* Perform inverse DCT*/
503 MBiDCT(data, cbp);
504
505 /* Transfer back the data -- Add the data */
506 MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 1, cbp);
507 }
508
509 return(cbp);
510 }
511
512 /* if sum(diff between field lines) < sum(diff between frame lines), use field dct */
513 uint32_t
MBFieldTest_c(int16_t data[6* 64])514 MBFieldTest_c(int16_t data[6 * 64])
515 {
516 const uint8_t blocks[] =
517 { 0 * 64, 0 * 64, 0 * 64, 0 * 64, 2 * 64, 2 * 64, 2 * 64, 2 * 64 };
518 const uint8_t lines[] = { 0, 16, 32, 48, 0, 16, 32, 48 };
519
520 int frame = 0, field = 0;
521 int i, j;
522
523 for (i = 0; i < 7; ++i) {
524 for (j = 0; j < 8; ++j) {
525 frame +=
526 abs(data[0 * 64 + (i + 1) * 8 + j] - data[0 * 64 + i * 8 + j]);
527 frame +=
528 abs(data[1 * 64 + (i + 1) * 8 + j] - data[1 * 64 + i * 8 + j]);
529 frame +=
530 abs(data[2 * 64 + (i + 1) * 8 + j] - data[2 * 64 + i * 8 + j]);
531 frame +=
532 abs(data[3 * 64 + (i + 1) * 8 + j] - data[3 * 64 + i * 8 + j]);
533
534 field +=
535 abs(data[blocks[i + 1] + lines[i + 1] + j] -
536 data[blocks[i] + lines[i] + j]);
537 field +=
538 abs(data[blocks[i + 1] + lines[i + 1] + 8 + j] -
539 data[blocks[i] + lines[i] + 8 + j]);
540 field +=
541 abs(data[blocks[i + 1] + 64 + lines[i + 1] + j] -
542 data[blocks[i] + 64 + lines[i] + j]);
543 field +=
544 abs(data[blocks[i + 1] + 64 + lines[i + 1] + 8 + j] -
545 data[blocks[i] + 64 + lines[i] + 8 + j]);
546 }
547 }
548
549 return (frame >= (field + 350));
550 }
551
552
553 /* deinterlace Y blocks vertically */
554
555 #define MOVLINE(X,Y) memcpy(X, Y, sizeof(tmp))
556 #define LINE(X,Y) &data[X*64 + Y*8]
557
558 void
MBFrameToField(int16_t data[6* 64])559 MBFrameToField(int16_t data[6 * 64])
560 {
561 int16_t tmp[8];
562
563 /* left blocks */
564
565 /* 1=2, 2=4, 4=8, 8=1 */
566 MOVLINE(tmp, LINE(0, 1));
567 MOVLINE(LINE(0, 1), LINE(0, 2));
568 MOVLINE(LINE(0, 2), LINE(0, 4));
569 MOVLINE(LINE(0, 4), LINE(2, 0));
570 MOVLINE(LINE(2, 0), tmp);
571
572 /* 3=6, 6=12, 12=9, 9=3 */
573 MOVLINE(tmp, LINE(0, 3));
574 MOVLINE(LINE(0, 3), LINE(0, 6));
575 MOVLINE(LINE(0, 6), LINE(2, 4));
576 MOVLINE(LINE(2, 4), LINE(2, 1));
577 MOVLINE(LINE(2, 1), tmp);
578
579 /* 5=10, 10=5 */
580 MOVLINE(tmp, LINE(0, 5));
581 MOVLINE(LINE(0, 5), LINE(2, 2));
582 MOVLINE(LINE(2, 2), tmp);
583
584 /* 7=14, 14=13, 13=11, 11=7 */
585 MOVLINE(tmp, LINE(0, 7));
586 MOVLINE(LINE(0, 7), LINE(2, 6));
587 MOVLINE(LINE(2, 6), LINE(2, 5));
588 MOVLINE(LINE(2, 5), LINE(2, 3));
589 MOVLINE(LINE(2, 3), tmp);
590
591 /* right blocks */
592
593 /* 1=2, 2=4, 4=8, 8=1 */
594 MOVLINE(tmp, LINE(1, 1));
595 MOVLINE(LINE(1, 1), LINE(1, 2));
596 MOVLINE(LINE(1, 2), LINE(1, 4));
597 MOVLINE(LINE(1, 4), LINE(3, 0));
598 MOVLINE(LINE(3, 0), tmp);
599
600 /* 3=6, 6=12, 12=9, 9=3 */
601 MOVLINE(tmp, LINE(1, 3));
602 MOVLINE(LINE(1, 3), LINE(1, 6));
603 MOVLINE(LINE(1, 6), LINE(3, 4));
604 MOVLINE(LINE(3, 4), LINE(3, 1));
605 MOVLINE(LINE(3, 1), tmp);
606
607 /* 5=10, 10=5 */
608 MOVLINE(tmp, LINE(1, 5));
609 MOVLINE(LINE(1, 5), LINE(3, 2));
610 MOVLINE(LINE(3, 2), tmp);
611
612 /* 7=14, 14=13, 13=11, 11=7 */
613 MOVLINE(tmp, LINE(1, 7));
614 MOVLINE(LINE(1, 7), LINE(3, 6));
615 MOVLINE(LINE(3, 6), LINE(3, 5));
616 MOVLINE(LINE(3, 5), LINE(3, 3));
617 MOVLINE(LINE(3, 3), tmp);
618 }
619
620 /*****************************************************************************
621 * Trellis based R-D optimal quantization
622 *
623 * Trellis Quant code (C) 2003 Pascal Massimino skal(at)planet-d.net
624 *
625 ****************************************************************************/
626
627 /*----------------------------------------------------------------------------
628 *
629 * Trellis-Based quantization
630 *
631 * So far I understand this paper:
632 *
633 * "Trellis-Based R-D Optimal Quantization in H.263+"
634 * J.Wen, M.Luttrell, J.Villasenor
635 * IEEE Transactions on Image Processing, Vol.9, No.8, Aug. 2000.
636 *
637 * we are at stake with a simplified Bellmand-Ford / Dijkstra Single
638 * Source Shortest Path algo. But due to the underlying graph structure
639 * ("Trellis"), it can be turned into a dynamic programming algo,
640 * partially saving the explicit graph's nodes representation. And
641 * without using a heap, since the open frontier of the DAG is always
642 * known, and of fixed size.
643 *--------------------------------------------------------------------------*/
644
645
646
647 /* Codes lengths for relevant levels. */
648
649 /* let's factorize: */
650 static const uint8_t Code_Len0[64] = {
651 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
652 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
653 static const uint8_t Code_Len1[64] = {
654 20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
655 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
656 static const uint8_t Code_Len2[64] = {
657 19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
658 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
659 static const uint8_t Code_Len3[64] = {
660 18,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
661 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
662 static const uint8_t Code_Len4[64] = {
663 17,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
664 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
665 static const uint8_t Code_Len5[64] = {
666 16,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
667 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
668 static const uint8_t Code_Len6[64] = {
669 15,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
670 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
671 static const uint8_t Code_Len7[64] = {
672 13,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
673 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
674 static const uint8_t Code_Len8[64] = {
675 11,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
676 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
677 static const uint8_t Code_Len9[64] = {
678 12,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
679 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
680 static const uint8_t Code_Len10[64] = {
681 12,20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
682 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
683 static const uint8_t Code_Len11[64] = {
684 12,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
685 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
686 static const uint8_t Code_Len12[64] = {
687 11,17,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
688 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
689 static const uint8_t Code_Len13[64] = {
690 11,15,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
691 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
692 static const uint8_t Code_Len14[64] = {
693 10,12,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
694 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
695 static const uint8_t Code_Len15[64] = {
696 10,13,17,19,21,21,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
697 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
698 static const uint8_t Code_Len16[64] = {
699 9,12,13,18,18,19,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
700 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
701 static const uint8_t Code_Len17[64] = {
702 8,11,13,14,14,14,15,19,19,19,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
703 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
704 static const uint8_t Code_Len18[64] = {
705 7, 9,11,11,13,13,13,15,15,15,16,22,22,22,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
706 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
707 static const uint8_t Code_Len19[64] = {
708 5, 7, 9,10,10,11,11,11,11,11,13,14,16,17,17,18,18,18,18,18,18,18,18,20,20,21,21,30,30,30,30,30,
709 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
710 static const uint8_t Code_Len20[64] = {
711 3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9,10,10,10,10,10,10,10,10,12,12,13,13,12,13,14,15,15,
712 15,16,16,16,16,17,17,17,18,18,19,19,19,19,19,19,19,19,21,21,22,22,30,30,30,30,30,30,30,30,30,30 };
713
714 /* a few more table for LAST table: */
715 static const uint8_t Code_Len21[64] = {
716 13,20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
717 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
718 static const uint8_t Code_Len22[64] = {
719 12,15,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
720 30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
721 static const uint8_t Code_Len23[64] = {
722 10,12,15,15,15,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,20,20,20,
723 20,21,21,21,21,21,21,21,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
724 static const uint8_t Code_Len24[64] = {
725 5, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,10,10,10,10,10,10,10,10,11,11,11,11,12,12,12,
726 12,13,13,13,13,13,13,13,13,14,16,16,16,16,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19};
727
728
729 static const uint8_t * const B16_17_Code_Len[24] = { /* levels [1..24] */
730 Code_Len20,Code_Len19,Code_Len18,Code_Len17,
731 Code_Len16,Code_Len15,Code_Len14,Code_Len13,
732 Code_Len12,Code_Len11,Code_Len10,Code_Len9,
733 Code_Len8, Code_Len7 ,Code_Len6 ,Code_Len5,
734 Code_Len4, Code_Len3, Code_Len3 ,Code_Len2,
735 Code_Len2, Code_Len1, Code_Len1, Code_Len1,
736 };
737
738 static const uint8_t * const B16_17_Code_Len_Last[6] = { /* levels [1..6] */
739 Code_Len24,Code_Len23,Code_Len22,Code_Len21, Code_Len3, Code_Len1,
740 };
741
742 /* TL_SHIFT controls the precision of the RD optimizations in trellis
743 * valid range is [10..16]. The bigger, the more trellis is vulnerable
744 * to overflows in cost formulas.
745 * - 10 allows ac values up to 2^11 == 2048
746 * - 16 allows ac values up to 2^8 == 256
747 */
748 #define TL_SHIFT 11
749 #define TL(q) ((0xfe00>>(16-TL_SHIFT))/(q*q))
750
751 static const int Trellis_Lambda_Tabs[31] = {
752 TL( 1),TL( 2),TL( 3),TL( 4),TL( 5),TL( 6), TL( 7),
753 TL( 8),TL( 9),TL(10),TL(11),TL(12),TL(13),TL(14), TL(15),
754 TL(16),TL(17),TL(18),TL(19),TL(20),TL(21),TL(22), TL(23),
755 TL(24),TL(25),TL(26),TL(27),TL(28),TL(29),TL(30), TL(31)
756 };
757 #undef TL
758
759 static int __inline
Find_Last(const int16_t * C,const uint16_t * Zigzag,int i)760 Find_Last(const int16_t *C, const uint16_t *Zigzag, int i)
761 {
762 while(i>=0)
763 if (C[Zigzag[i]])
764 return i;
765 else i--;
766 return -1;
767 }
768
769 #define TRELLIS_MIN_EFFORT 3
770
calc_mseh(int16_t dQ,uint16_t mask,const int index,const int Lambda)771 static __inline uint32_t calc_mseh(int16_t dQ, uint16_t mask,
772 const int index, const int Lambda)
773 {
774 uint32_t t = (mask * Inv_iMask_Coeff[index] + 32) >> 7;
775 uint16_t u = abs(dQ) << 4;
776 uint16_t thresh = (t < 65536) ? t : 65535;
777
778 if (u <= thresh)
779 u = 0; /* The error is not perceivable */
780 else
781 u -= thresh;
782
783 u = ((u + iCSF_Round[index]) * iCSF_Coeff[index]) >> 16;
784
785 return Lambda*((((u*u)>>4) + 4*(dQ*dQ))/5);
786 }
787
788 /* this routine has been strippen of all debug code */
789 static int
dct_quantize_trellis_c(int16_t * const Out,const int16_t * const In,int Q,const uint16_t * const Zigzag,const uint16_t * const QuantMatrix,int Non_Zero,int Sum,int Lambda_Mod,const uint32_t rel_var8,const int Metric)790 dct_quantize_trellis_c(int16_t *const Out,
791 const int16_t *const In,
792 int Q,
793 const uint16_t * const Zigzag,
794 const uint16_t * const QuantMatrix,
795 int Non_Zero,
796 int Sum,
797 int Lambda_Mod,
798 const uint32_t rel_var8,
799 const int Metric)
800 {
801
802 /* Note: We should search last non-zero coeffs on *real* DCT input coeffs
803 * (In[]), not quantized one (Out[]). However, it only improves the result
804 * *very* slightly (~0.01dB), whereas speed drops to crawling level :)
805 * Well, actually, taking 1 more coeff past Non_Zero into account sometimes
806 * helps. */
807 typedef struct { int16_t Run, Level; } NODE;
808
809 NODE Nodes[65], Last = { 0, 0};
810 uint32_t Run_Costs0[64+1];
811 uint32_t * const Run_Costs = Run_Costs0 + 1;
812
813 /* it's 1/lambda, actually */
814 const int Lambda = (Lambda_Mod*Trellis_Lambda_Tabs[Q-1])>>LAMBDA_EXP;
815
816 int Run_Start = -1;
817 uint32_t Min_Cost = 2<<TL_SHIFT;
818
819 int Last_Node = -1;
820 uint32_t Last_Cost = 0;
821
822 int i, j;
823
824 uint32_t mask = (Metric) ? ((isqrt(2*coeff8_energy(In)*rel_var8) + 48) >> 6) : 0;
825
826 /* source (w/ CBP penalty) */
827 Run_Costs[-1] = 2<<TL_SHIFT;
828
829 Non_Zero = Find_Last(Out, Zigzag, Non_Zero);
830 if (Non_Zero < TRELLIS_MIN_EFFORT)
831 Non_Zero = TRELLIS_MIN_EFFORT;
832
833 for(i=0; i<=Non_Zero; i++) {
834 const int q = ((Q*QuantMatrix[Zigzag[i]])>>4);
835 const int Mult = 2*q;
836 const int Bias = (q-1) | 1;
837 const int Lev0 = Mult + Bias;
838
839 const int AC = In[Zigzag[i]];
840 const int Level1 = Out[Zigzag[i]];
841 const unsigned int Dist0 = (Metric) ? (calc_mseh(AC, mask, Zigzag[i], Lambda)) : (Lambda* AC*AC);
842 uint32_t Best_Cost = 0xf0000000;
843 Last_Cost += Dist0;
844
845 /* very specialized loop for -1,0,+1 */
846 if ((uint32_t)(Level1+1)<3) {
847 int dQ;
848 int Run;
849 uint32_t Cost0;
850
851 if (AC<0) {
852 Nodes[i].Level = -1;
853 dQ = Lev0 + AC;
854 } else {
855 Nodes[i].Level = 1;
856 dQ = Lev0 - AC;
857 }
858 Cost0 = (Metric) ? (calc_mseh(dQ, mask, Zigzag[i], Lambda)) : (Lambda*dQ*dQ);
859
860 Nodes[i].Run = 1;
861 Best_Cost = (Code_Len20[0]<<TL_SHIFT) + Run_Costs[i-1]+Cost0;
862 for(Run=i-Run_Start; Run>0; --Run) {
863 const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run];
864 const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<TL_SHIFT);
865 const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<TL_SHIFT);
866
867 /* TODO: what about tie-breaks? Should we favor short runs or
868 * long runs? Although the error is the same, it would not be
869 * spread the same way along high and low frequencies... */
870
871 /* Gruel: I'd say, favour short runs => hifreq errors (HVS) */
872
873 if (Cost<Best_Cost) {
874 Best_Cost = Cost;
875 Nodes[i].Run = Run;
876 }
877
878 if (lCost<Last_Cost) {
879 Last_Cost = lCost;
880 Last.Run = Run;
881 Last_Node = i;
882 }
883 }
884 if (Last_Node==i)
885 Last.Level = Nodes[i].Level;
886 } else if (51U>(uint32_t)(Level1+25)) {
887 /* "big" levels (not less than ESC3, though) */
888 const uint8_t *Tbl_L1, *Tbl_L2, *Tbl_L1_Last, *Tbl_L2_Last;
889 int Level2;
890 int dQ1, dQ2;
891 int Run;
892 uint32_t Dist1,Dist2;
893 int dDist21;
894
895 if (Level1>1) {
896 dQ1 = Level1*Mult-AC + Bias;
897 dQ2 = dQ1 - Mult;
898 Level2 = Level1-1;
899 Tbl_L1 = (Level1<=24) ? B16_17_Code_Len[Level1-1] : Code_Len0;
900 Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0;
901 Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0;
902 Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0;
903 } else { /* Level1<-1 */
904 dQ1 = Level1*Mult-AC - Bias;
905 dQ2 = dQ1 + Mult;
906 Level2 = Level1 + 1;
907 Tbl_L1 = (Level1>=-24) ? B16_17_Code_Len[Level1^-1] : Code_Len0;
908 Tbl_L2 = (Level2>=-24) ? B16_17_Code_Len[Level2^-1] : Code_Len0;
909 Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0;
910 Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0;
911 }
912
913 if (Metric) {
914 Dist1 = calc_mseh(dQ1, mask, Zigzag[i], Lambda);
915 Dist2 = calc_mseh(dQ2, mask, Zigzag[i], Lambda);
916 }
917 else {
918 Dist1 = Lambda*dQ1*dQ1;
919 Dist2 = Lambda*dQ2*dQ2;
920 }
921 dDist21 = Dist2-Dist1;
922
923 for(Run=i-Run_Start; Run>0; --Run)
924 {
925 const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run];
926 uint32_t Cost1, Cost2;
927 int bLevel;
928
929 /* for sub-optimal (but slightly worth it, speed-wise) search,
930 * uncomment the following:
931 * if (Cost_Base>=Best_Cost) continue;
932 * (? doesn't seem to have any effect -- gruel ) */
933
934 Cost1 = Cost_Base + (Tbl_L1[Run-1]<<TL_SHIFT);
935 Cost2 = Cost_Base + (Tbl_L2[Run-1]<<TL_SHIFT) + dDist21;
936
937 if (Cost2<Cost1) {
938 Cost1 = Cost2;
939 bLevel = Level2;
940 } else {
941 bLevel = Level1;
942 }
943
944 if (Cost1<Best_Cost) {
945 Best_Cost = Cost1;
946 Nodes[i].Run = Run;
947 Nodes[i].Level = bLevel;
948 }
949
950 Cost1 = Cost_Base + (Tbl_L1_Last[Run-1]<<TL_SHIFT);
951 Cost2 = Cost_Base + (Tbl_L2_Last[Run-1]<<TL_SHIFT) + dDist21;
952
953 if (Cost2<Cost1) {
954 Cost1 = Cost2;
955 bLevel = Level2;
956 } else {
957 bLevel = Level1;
958 }
959
960 if (Cost1<Last_Cost) {
961 Last_Cost = Cost1;
962 Last.Run = Run;
963 Last.Level = bLevel;
964 Last_Node = i;
965 }
966 } /* end of "for Run" */
967 } else {
968 /* Very very high levels, with no chance of being optimizable
969 * => Simply pick best Run. */
970 int Run;
971 for(Run=i-Run_Start; Run>0; --Run) {
972 /* 30 bits + no distortion */
973 const uint32_t Cost = (30<<TL_SHIFT) + Run_Costs[i-Run];
974 if (Cost<Best_Cost) {
975 Best_Cost = Cost;
976 Nodes[i].Run = Run;
977 Nodes[i].Level = Level1;
978 }
979
980 if (Cost<Last_Cost) {
981 Last_Cost = Cost;
982 Last.Run = Run;
983 Last.Level = Level1;
984 Last_Node = i;
985 }
986 }
987 }
988
989
990 Run_Costs[i] = Best_Cost;
991
992 if (Best_Cost < Min_Cost + Dist0) {
993 Min_Cost = Best_Cost;
994 Run_Start = i;
995 } else {
996 /* as noticed by Michael Niedermayer (michaelni at gmx.at),
997 * there's a code shorter by 1 bit for a larger run (!), same
998 * level. We give it a chance by not moving the left barrier too
999 * much. */
1000 while( Run_Costs[Run_Start]>Min_Cost+(1<<TL_SHIFT) )
1001 Run_Start++;
1002
1003 /* spread on preceding coeffs the cost incurred by skipping this
1004 * one */
1005 for(j=Run_Start; j<i; ++j) Run_Costs[j] += Dist0;
1006 Min_Cost += Dist0;
1007 }
1008 }
1009
1010 /* It seems trellis doesn't give good results... just leave the block untouched
1011 * and return the original sum value */
1012 if (Last_Node<0)
1013 return Sum;
1014
1015 /* reconstruct optimal sequence backward with surviving paths */
1016 memset(Out, 0x00, 64*sizeof(*Out));
1017 Out[Zigzag[Last_Node]] = Last.Level;
1018 i = Last_Node - Last.Run;
1019 Sum = abs(Last.Level);
1020 while(i>=0) {
1021 Out[Zigzag[i]] = Nodes[i].Level;
1022 Sum += abs(Nodes[i].Level);
1023 i -= Nodes[i].Run;
1024 }
1025
1026 return Sum;
1027 }
1028
1029 /* original version including heavy debugging info */
1030
1031 #ifdef DBGTRELL
1032
1033 #define DBG 0
1034
Evaluate_Cost(const int16_t * C,int Mult,int Bias,const uint16_t * Zigzag,int Max,int Lambda)1035 static __inline uint32_t Evaluate_Cost(const int16_t *C, int Mult, int Bias,
1036 const uint16_t * Zigzag, int Max, int Lambda)
1037 {
1038 #if (DBG>0)
1039 const int16_t * const Ref = C + 6*64;
1040 int Last = Max;
1041 int Bits = 0;
1042 int Dist = 0;
1043 int i;
1044 uint32_t Cost;
1045
1046 while(Last>=0 && C[Zigzag[Last]]==0)
1047 Last--;
1048
1049 if (Last>=0) {
1050 int j=0, j0=0;
1051 int Run, Level;
1052
1053 Bits = 2; /* CBP */
1054 while(j<Last) {
1055 while(!C[Zigzag[j]])
1056 j++;
1057 if (j==Last)
1058 break;
1059 Level=C[Zigzag[j]];
1060 Run = j - j0;
1061 j0 = ++j;
1062 if (Level>=-24 && Level<=24)
1063 Bits += B16_17_Code_Len[(Level<0) ? -Level-1 : Level-1][Run];
1064 else
1065 Bits += 30;
1066 }
1067 Level = C[Zigzag[Last]];
1068 Run = j - j0;
1069 if (Level>=-6 && Level<=6)
1070 Bits += B16_17_Code_Len_Last[(Level<0) ? -Level-1 : Level-1][Run];
1071 else
1072 Bits += 30;
1073 }
1074
1075 for(i=0; i<=Last; ++i) {
1076 int V = C[Zigzag[i]]*Mult;
1077 if (V>0)
1078 V += Bias;
1079 else
1080 if (V<0)
1081 V -= Bias;
1082 V -= Ref[Zigzag[i]];
1083 Dist += V*V;
1084 }
1085 Cost = Lambda*Dist + (Bits<<TL_SHIFT);
1086 if (DBG==1)
1087 printf( " Last:%2d/%2d Cost = [(Bits=%5.0d) + Lambda*(Dist=%6.0d) = %d ] >>12= %d ", Last,Max, Bits, Dist, Cost, Cost>>12 );
1088 return Cost;
1089
1090 #else
1091 return 0;
1092 #endif
1093 }
1094
1095
1096 static int
dct_quantize_trellis_h263_c(int16_t * const Out,const int16_t * const In,int Q,const uint16_t * const Zigzag,int Non_Zero)1097 dct_quantize_trellis_h263_c(int16_t *const Out, const int16_t *const In, int Q, const uint16_t * const Zigzag, int Non_Zero)
1098 {
1099
1100 /*
1101 * Note: We should search last non-zero coeffs on *real* DCT input coeffs (In[]),
1102 * not quantized one (Out[]). However, it only improves the result *very*
1103 * slightly (~0.01dB), whereas speed drops to crawling level :)
1104 * Well, actually, taking 1 more coeff past Non_Zero into account sometimes helps.
1105 */
1106 typedef struct { int16_t Run, Level; } NODE;
1107
1108 NODE Nodes[65], Last;
1109 uint32_t Run_Costs0[64+1];
1110 uint32_t * const Run_Costs = Run_Costs0 + 1;
1111 const int Mult = 2*Q;
1112 const int Bias = (Q-1) | 1;
1113 const int Lev0 = Mult + Bias;
1114 const int Lambda = Trellis_Lambda_Tabs[Q-1]; /* it's 1/lambda, actually */
1115
1116 int Run_Start = -1;
1117 Run_Costs[-1] = 2<<TL_SHIFT; /* source (w/ CBP penalty) */
1118 uint32_t Min_Cost = 2<<TL_SHIFT;
1119
1120 int Last_Node = -1;
1121 uint32_t Last_Cost = 0;
1122
1123 int i, j;
1124
1125 #if (DBG>0)
1126 Last.Level = 0; Last.Run = -1; /* just initialize to smthg */
1127 #endif
1128
1129 Non_Zero = Find_Last(Out, Zigzag, Non_Zero);
1130 if (Non_Zero<0)
1131 return -1;
1132
1133 for(i=0; i<=Non_Zero; i++)
1134 {
1135 const int AC = In[Zigzag[i]];
1136 const int Level1 = Out[Zigzag[i]];
1137 const int Dist0 = Lambda* AC*AC;
1138 uint32_t Best_Cost = 0xf0000000;
1139 Last_Cost += Dist0;
1140
1141 if ((uint32_t)(Level1+1)<3) /* very specialized loop for -1,0,+1 */
1142 {
1143 int dQ;
1144 int Run;
1145 uint32_t Cost0;
1146
1147 if (AC<0) {
1148 Nodes[i].Level = -1;
1149 dQ = Lev0 + AC;
1150 } else {
1151 Nodes[i].Level = 1;
1152 dQ = Lev0 - AC;
1153 }
1154 Cost0 = Lambda*dQ*dQ;
1155
1156 Nodes[i].Run = 1;
1157 Best_Cost = (Code_Len20[0]<<TL_SHIFT) + Run_Costs[i-1]+Cost0;
1158 for(Run=i-Run_Start; Run>0; --Run)
1159 {
1160 const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run];
1161 const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<TL_SHIFT);
1162 const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<TL_SHIFT);
1163
1164 /*
1165 * TODO: what about tie-breaks? Should we favor short runs or
1166 * long runs? Although the error is the same, it would not be
1167 * spread the same way along high and low frequencies...
1168 */
1169 if (Cost<Best_Cost) {
1170 Best_Cost = Cost;
1171 Nodes[i].Run = Run;
1172 }
1173
1174 if (lCost<Last_Cost) {
1175 Last_Cost = lCost;
1176 Last.Run = Run;
1177 Last_Node = i;
1178 }
1179 }
1180 if (Last_Node==i)
1181 Last.Level = Nodes[i].Level;
1182
1183 if (DBG==1) {
1184 Run_Costs[i] = Best_Cost;
1185 printf( "Costs #%2d: ", i);
1186 for(j=-1;j<=Non_Zero;++j) {
1187 if (j==Run_Start) printf( " %3.0d|", Run_Costs[j]>>12 );
1188 else if (j>Run_Start && j<i) printf( " %3.0d|", Run_Costs[j]>>12 );
1189 else if (j==i) printf( "(%3.0d)", Run_Costs[j]>>12 );
1190 else printf( " - |" );
1191 }
1192 printf( "<%3.0d %2d %d>", Min_Cost>>12, Nodes[i].Level, Nodes[i].Run );
1193 printf( " Last:#%2d {%3.0d %2d %d}", Last_Node, Last_Cost>>12, Last.Level, Last.Run );
1194 printf( " AC:%3.0d Dist0:%3d Dist(%d)=%d", AC, Dist0>>12, Nodes[i].Level, Cost0>>12 );
1195 printf( "\n" );
1196 }
1197 }
1198 else /* "big" levels */
1199 {
1200 const uint8_t *Tbl_L1, *Tbl_L2, *Tbl_L1_Last, *Tbl_L2_Last;
1201 int Level2;
1202 int dQ1, dQ2;
1203 int Run;
1204 uint32_t Dist1,Dist2;
1205 int dDist21;
1206
1207 if (Level1>1) {
1208 dQ1 = Level1*Mult-AC + Bias;
1209 dQ2 = dQ1 - Mult;
1210 Level2 = Level1-1;
1211 Tbl_L1 = (Level1<=24) ? B16_17_Code_Len[Level1-1] : Code_Len0;
1212 Tbl_L2 = (Level2<=24) ? B16_17_Code_Len[Level2-1] : Code_Len0;
1213 Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0;
1214 Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0;
1215 } else { /* Level1<-1 */
1216 dQ1 = Level1*Mult-AC - Bias;
1217 dQ2 = dQ1 + Mult;
1218 Level2 = Level1 + 1;
1219 Tbl_L1 = (Level1>=-24) ? B16_17_Code_Len[Level1^-1] : Code_Len0;
1220 Tbl_L2 = (Level2>=-24) ? B16_17_Code_Len[Level2^-1] : Code_Len0;
1221 Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0;
1222 Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0;
1223 }
1224 Dist1 = Lambda*dQ1*dQ1;
1225 Dist2 = Lambda*dQ2*dQ2;
1226 dDist21 = Dist2-Dist1;
1227
1228 for(Run=i-Run_Start; Run>0; --Run)
1229 {
1230 const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run];
1231 uint32_t Cost1, Cost2;
1232 int bLevel;
1233
1234 /*
1235 * for sub-optimal (but slightly worth it, speed-wise) search, uncomment the following:
1236 * if (Cost_Base>=Best_Cost) continue;
1237 */
1238 Cost1 = Cost_Base + (Tbl_L1[Run-1]<<TL_SHIFT);
1239 Cost2 = Cost_Base + (Tbl_L2[Run-1]<<TL_SHIFT) + dDist21;
1240
1241 if (Cost2<Cost1) {
1242 Cost1 = Cost2;
1243 bLevel = Level2;
1244 } else
1245 bLevel = Level1;
1246
1247 if (Cost1<Best_Cost) {
1248 Best_Cost = Cost1;
1249 Nodes[i].Run = Run;
1250 Nodes[i].Level = bLevel;
1251 }
1252
1253 Cost1 = Cost_Base + (Tbl_L1_Last[Run-1]<<TL_SHIFT);
1254 Cost2 = Cost_Base + (Tbl_L2_Last[Run-1]<<TL_SHIFT) + dDist21;
1255
1256 if (Cost2<Cost1) {
1257 Cost1 = Cost2;
1258 bLevel = Level2;
1259 } else
1260 bLevel = Level1;
1261
1262 if (Cost1<Last_Cost) {
1263 Last_Cost = Cost1;
1264 Last.Run = Run;
1265 Last.Level = bLevel;
1266 Last_Node = i;
1267 }
1268 } /* end of "for Run" */
1269
1270 if (DBG==1) {
1271 Run_Costs[i] = Best_Cost;
1272 printf( "Costs #%2d: ", i);
1273 for(j=-1;j<=Non_Zero;++j) {
1274 if (j==Run_Start) printf( " %3.0d|", Run_Costs[j]>>12 );
1275 else if (j>Run_Start && j<i) printf( " %3.0d|", Run_Costs[j]>>12 );
1276 else if (j==i) printf( "(%3.0d)", Run_Costs[j]>>12 );
1277 else printf( " - |" );
1278 }
1279 printf( "<%3.0d %2d %d>", Min_Cost>>12, Nodes[i].Level, Nodes[i].Run );
1280 printf( " Last:#%2d {%3.0d %2d %d}", Last_Node, Last_Cost>>12, Last.Level, Last.Run );
1281 printf( " AC:%3.0d Dist0:%3d Dist(%2d):%3d Dist(%2d):%3d", AC, Dist0>>12, Level1, Dist1>>12, Level2, Dist2>>12 );
1282 printf( "\n" );
1283 }
1284 }
1285
1286 Run_Costs[i] = Best_Cost;
1287
1288 if (Best_Cost < Min_Cost + Dist0) {
1289 Min_Cost = Best_Cost;
1290 Run_Start = i;
1291 }
1292 else
1293 {
1294 /*
1295 * as noticed by Michael Niedermayer (michaelni at gmx.at), there's
1296 * a code shorter by 1 bit for a larger run (!), same level. We give
1297 * it a chance by not moving the left barrier too much.
1298 */
1299
1300 while( Run_Costs[Run_Start]>Min_Cost+(1<<TL_SHIFT) )
1301 Run_Start++;
1302
1303 /* spread on preceding coeffs the cost incurred by skipping this one */
1304 for(j=Run_Start; j<i; ++j) Run_Costs[j] += Dist0;
1305 Min_Cost += Dist0;
1306 }
1307 }
1308
1309 if (DBG) {
1310 Last_Cost = Evaluate_Cost(Out,Mult,Bias, Zigzag,Non_Zero, Lambda);
1311 if (DBG==1) {
1312 printf( "=> " );
1313 for(i=0; i<=Non_Zero; ++i) printf( "[%3.0d] ", Out[Zigzag[i]] );
1314 printf( "\n" );
1315 }
1316 }
1317
1318 if (Last_Node<0)
1319 return -1;
1320
1321 /* reconstruct optimal sequence backward with surviving paths */
1322 memset(Out, 0x00, 64*sizeof(*Out));
1323 Out[Zigzag[Last_Node]] = Last.Level;
1324 i = Last_Node - Last.Run;
1325 while(i>=0) {
1326 Out[Zigzag[i]] = Nodes[i].Level;
1327 i -= Nodes[i].Run;
1328 }
1329
1330 if (DBG) {
1331 uint32_t Cost = Evaluate_Cost(Out,Mult,Bias, Zigzag,Non_Zero, Lambda);
1332 if (DBG==1) {
1333 printf( "<= " );
1334 for(i=0; i<=Last_Node; ++i) printf( "[%3.0d] ", Out[Zigzag[i]] );
1335 printf( "\n--------------------------------\n" );
1336 }
1337 if (Cost>Last_Cost) printf( "!!! %u > %u\n", Cost, Last_Cost );
1338 }
1339 return Last_Node;
1340 }
1341
1342 #undef DBG
1343
1344 #endif
1345