1 /*****************************************************************************
2  *
3  *  XVID MPEG-4 VIDEO CODEC
4  *  - MB Transfer/Quantization functions -
5  *
6  *  Copyright(C) 2001-2010  Peter Ross <pross@xvid.org>
7  *               2001-2010  Michael Militzer <michael@xvid.org>
8  *               2003       Edouard Gomez <ed.gomez@free.fr>
9  *
10  *  This program is free software ; you can redistribute it and/or modify
11  *  it under the terms of the GNU General Public License as published by
12  *  the Free Software Foundation ; either version 2 of the License, or
13  *  (at your option) any later version.
14  *
15  *  This program is distributed in the hope that it will be useful,
16  *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
17  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *  GNU General Public License for more details.
19  *
20  *  You should have received a copy of the GNU General Public License
21  *  along with this program ; if not, write to the Free Software
22  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
23  *
24  * $Id: mbtransquant.c 2053 2011-11-04 15:23:46Z Isibaar $
25  *
26  ****************************************************************************/
27 
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "../portab.h"
33 #include "mbfunctions.h"
34 
35 #include "../global.h"
36 #include "mem_transfer.h"
37 #include "timer.h"
38 #include "../bitstream/mbcoding.h"
39 #include "../bitstream/zigzag.h"
40 #include "../dct/fdct.h"
41 #include "../dct/idct.h"
42 #include "../quant/quant.h"
43 #include "../motion/sad.h"
44 #include "../encoder.h"
45 
46 #include  "../quant/quant_matrix.h"
47 
48 MBFIELDTEST_PTR MBFieldTest;
49 
50 /*
51  * Skip blocks having a coefficient sum below this value. This value will be
52  * corrected according to the MB quantizer to avoid artifacts for quant==1
53  */
54 #define PVOP_TOOSMALL_LIMIT 1
55 #define BVOP_TOOSMALL_LIMIT 3
56 
57 /*****************************************************************************
58  * Local functions
59  ****************************************************************************/
60 
61 /* permute block and return field dct choice */
62 static __inline uint32_t
MBDecideFieldDCT(int16_t data[6* 64])63 MBDecideFieldDCT(int16_t data[6 * 64])
64 {
65 	uint32_t field = MBFieldTest(data);
66 
67 	if (field)
68 		MBFrameToField(data);
69 
70 	return field;
71 }
72 
73 /* Performs Forward DCT on all blocks */
74 static __inline void
MBfDCT(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,uint32_t x_pos,uint32_t y_pos,int16_t data[6* 64])75 MBfDCT(const MBParam * const pParam,
76 	   const FRAMEINFO * const frame,
77 	   MACROBLOCK * const pMB,
78 	   uint32_t x_pos,
79 	   uint32_t y_pos,
80 	   int16_t data[6 * 64])
81 {
82 	/* Handles interlacing */
83 	start_timer();
84 	pMB->field_dct = 0;
85 	if ((frame->vol_flags & XVID_VOL_INTERLACING) &&
86 		(x_pos>0) && (x_pos<pParam->mb_width-1) &&
87 		(y_pos>0) && (y_pos<pParam->mb_height-1)) {
88 		pMB->field_dct = MBDecideFieldDCT(data);
89 	}
90 	stop_interlacing_timer();
91 
92 	/* Perform DCT */
93 	start_timer();
94 	fdct((short * const)&data[0 * 64]);
95 	fdct((short * const)&data[1 * 64]);
96 	fdct((short * const)&data[2 * 64]);
97 	fdct((short * const)&data[3 * 64]);
98 	fdct((short * const)&data[4 * 64]);
99 	fdct((short * const)&data[5 * 64]);
100 	stop_dct_timer();
101 }
102 
103 /* Performs Inverse DCT on all blocks */
104 static __inline void
MBiDCT(int16_t data[6* 64],const uint8_t cbp)105 MBiDCT(int16_t data[6 * 64],
106 	   const uint8_t cbp)
107 {
108 	start_timer();
109 	if(cbp & (1 << (5 - 0))) idct((short * const)&data[0 * 64]);
110 	if(cbp & (1 << (5 - 1))) idct((short * const)&data[1 * 64]);
111 	if(cbp & (1 << (5 - 2))) idct((short * const)&data[2 * 64]);
112 	if(cbp & (1 << (5 - 3))) idct((short * const)&data[3 * 64]);
113 	if(cbp & (1 << (5 - 4))) idct((short * const)&data[4 * 64]);
114 	if(cbp & (1 << (5 - 5))) idct((short * const)&data[5 * 64]);
115 	stop_idct_timer();
116 }
117 
118 /* Quantize all blocks -- Intra mode */
119 static __inline void
MBQuantIntra(const MBParam * pParam,const FRAMEINFO * const frame,const MACROBLOCK * pMB,int16_t qcoeff[6* 64],int16_t data[6* 64])120 MBQuantIntra(const MBParam * pParam,
121 			 const FRAMEINFO * const frame,
122 			 const MACROBLOCK * pMB,
123 			 int16_t qcoeff[6 * 64],
124 			 int16_t data[6*64])
125 {
126 	int scaler_lum, scaler_chr;
127 	quant_intraFuncPtr quant;
128 
129 	/* check if quant matrices need to be re-initialized with new quant */
130 	if (pParam->vol_flags & XVID_VOL_MPEGQUANT) {
131 		if (pParam->last_quant_initialized_intra != pMB->quant) {
132 			init_intra_matrix(pParam->mpeg_quant_matrices, pMB->quant);
133 		}
134 		quant = quant_mpeg_intra;
135 	} else {
136 		quant = quant_h263_intra;
137 	}
138 
139 	scaler_lum = get_dc_scaler(pMB->quant, 1);
140 	scaler_chr = get_dc_scaler(pMB->quant, 0);
141 
142 	/* Quantize the block */
143 	start_timer();
144 	quant(&data[0 * 64], &qcoeff[0 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
145 	quant(&data[1 * 64], &qcoeff[1 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
146 	quant(&data[2 * 64], &qcoeff[2 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
147 	quant(&data[3 * 64], &qcoeff[3 * 64], pMB->quant, scaler_lum, pParam->mpeg_quant_matrices);
148 	quant(&data[4 * 64], &qcoeff[4 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices);
149 	quant(&data[5 * 64], &qcoeff[5 * 64], pMB->quant, scaler_chr, pParam->mpeg_quant_matrices);
150 	stop_quant_timer();
151 }
152 
153 /* DeQuantize all blocks -- Intra mode */
154 static __inline void
MBDeQuantIntra(const MBParam * pParam,const int iQuant,int16_t qcoeff[6* 64],int16_t data[6* 64])155 MBDeQuantIntra(const MBParam * pParam,
156 			   const int iQuant,
157 			   int16_t qcoeff[6 * 64],
158 			   int16_t data[6*64])
159 {
160 	int mpeg;
161 	int scaler_lum, scaler_chr;
162 
163 	quant_intraFuncPtr const dequant[2] =
164 		{
165 			dequant_h263_intra,
166 			dequant_mpeg_intra
167 		};
168 
169 	mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
170 	scaler_lum = get_dc_scaler(iQuant, 1);
171 	scaler_chr = get_dc_scaler(iQuant, 0);
172 
173 	start_timer();
174 	dequant[mpeg](&qcoeff[0 * 64], &data[0 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
175 	dequant[mpeg](&qcoeff[1 * 64], &data[1 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
176 	dequant[mpeg](&qcoeff[2 * 64], &data[2 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
177 	dequant[mpeg](&qcoeff[3 * 64], &data[3 * 64], iQuant, scaler_lum, pParam->mpeg_quant_matrices);
178 	dequant[mpeg](&qcoeff[4 * 64], &data[4 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices);
179 	dequant[mpeg](&qcoeff[5 * 64], &data[5 * 64], iQuant, scaler_chr, pParam->mpeg_quant_matrices);
180 	stop_iquant_timer();
181 }
182 
183 static int
184 dct_quantize_trellis_c(int16_t *const Out,
185 					   const int16_t *const In,
186 					   int Q,
187 					   const uint16_t * const Zigzag,
188 					   const uint16_t * const QuantMatrix,
189 					   int Non_Zero,
190 					   int Sum,
191 					   int Lambda_Mod,
192 					   const uint32_t rel_var8,
193 					   const int Metric);
194 
195 /* Quantize all blocks -- Inter mode */
196 static __inline uint8_t
MBQuantInter(const MBParam * pParam,const FRAMEINFO * const frame,const MACROBLOCK * pMB,int16_t data[6* 64],int16_t qcoeff[6* 64],int bvop,int limit)197 MBQuantInter(const MBParam * pParam,
198 			 const FRAMEINFO * const frame,
199 			 const MACROBLOCK * pMB,
200 			 int16_t data[6 * 64],
201 			 int16_t qcoeff[6 * 64],
202 			 int bvop,
203 			 int limit)
204 {
205 
206 	int i;
207 	uint8_t cbp = 0;
208 	int sum;
209 	int code_block, mpeg;
210 
211 	quant_interFuncPtr const quant[2] =
212 		{
213 			quant_h263_inter,
214 			quant_mpeg_inter
215 		};
216 
217 	mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
218 
219 	for (i = 0; i < 6; i++) {
220 
221 		/* Quantize the block */
222 		start_timer();
223 
224 		sum = quant[mpeg](&qcoeff[i*64], &data[i*64], pMB->quant, pParam->mpeg_quant_matrices);
225 
226 		if(sum && (pMB->quant > 2) && (frame->vop_flags & XVID_VOP_TRELLISQUANT)) {
227 			const uint16_t *matrix;
228 			const static uint16_t h263matrix[] =
229 				{
230 					16, 16, 16, 16, 16, 16, 16, 16,
231 					16, 16, 16, 16, 16, 16, 16, 16,
232 					16, 16, 16, 16, 16, 16, 16, 16,
233 					16, 16, 16, 16, 16, 16, 16, 16,
234 					16, 16, 16, 16, 16, 16, 16, 16,
235 					16, 16, 16, 16, 16, 16, 16, 16,
236 					16, 16, 16, 16, 16, 16, 16, 16,
237 					16, 16, 16, 16, 16, 16, 16, 16
238 				};
239 
240 			matrix = (mpeg)?get_inter_matrix(pParam->mpeg_quant_matrices):h263matrix;
241 			sum = dct_quantize_trellis_c(&qcoeff[i*64], &data[i*64],
242 										 pMB->quant, &scan_tables[0][0],
243 										 matrix,
244 										 63,
245 										 sum,
246 										 pMB->lambda[i],
247 										 pMB->rel_var8[i],
248 										 !!(frame->vop_flags & XVID_VOP_RD_PSNRHVSM));
249 		}
250 		stop_quant_timer();
251 
252 		/*
253 		 * We code the block if the sum is higher than the limit and if the first
254 		 * two AC coefficients in zig zag order are not zero.
255 		 */
256 		code_block = 0;
257 		if ((sum >= limit) || (qcoeff[i*64+1] != 0) || (qcoeff[i*64+8] != 0)) {
258 			code_block = 1;
259 		} else {
260 
261 			if (bvop && (pMB->mode == MODE_DIRECT || pMB->mode == MODE_DIRECT_NO4V)) {
262 				/* dark blocks prevention for direct mode */
263 				if ((qcoeff[i*64] < -1) || (qcoeff[i*64] > 0))
264 					code_block = 1;
265 			} else {
266 				/* not direct mode */
267 				if (qcoeff[i*64] != 0)
268 					code_block = 1;
269 			}
270 		}
271 
272 		/* Set the corresponding cbp bit */
273 		cbp |= code_block << (5 - i);
274 	}
275 
276 	return(cbp);
277 }
278 
279 /* DeQuantize all blocks -- Inter mode */
280 static __inline void
MBDeQuantInter(const MBParam * pParam,const int iQuant,int16_t data[6* 64],int16_t qcoeff[6* 64],const uint8_t cbp)281 MBDeQuantInter(const MBParam * pParam,
282 			   const int iQuant,
283 			   int16_t data[6 * 64],
284 			   int16_t qcoeff[6 * 64],
285 			   const uint8_t cbp)
286 {
287 	int mpeg;
288 
289 	quant_interFuncPtr const dequant[2] =
290 		{
291 			dequant_h263_inter,
292 			dequant_mpeg_inter
293 		};
294 
295 	mpeg = !!(pParam->vol_flags & XVID_VOL_MPEGQUANT);
296 
297 	start_timer();
298 	if(cbp & (1 << (5 - 0))) dequant[mpeg](&data[0 * 64], &qcoeff[0 * 64], iQuant, pParam->mpeg_quant_matrices);
299 	if(cbp & (1 << (5 - 1))) dequant[mpeg](&data[1 * 64], &qcoeff[1 * 64], iQuant, pParam->mpeg_quant_matrices);
300 	if(cbp & (1 << (5 - 2))) dequant[mpeg](&data[2 * 64], &qcoeff[2 * 64], iQuant, pParam->mpeg_quant_matrices);
301 	if(cbp & (1 << (5 - 3))) dequant[mpeg](&data[3 * 64], &qcoeff[3 * 64], iQuant, pParam->mpeg_quant_matrices);
302 	if(cbp & (1 << (5 - 4))) dequant[mpeg](&data[4 * 64], &qcoeff[4 * 64], iQuant, pParam->mpeg_quant_matrices);
303 	if(cbp & (1 << (5 - 5))) dequant[mpeg](&data[5 * 64], &qcoeff[5 * 64], iQuant, pParam->mpeg_quant_matrices);
304 	stop_iquant_timer();
305 }
306 
307 typedef void (transfer_operation_8to16_t) (int16_t *Dst, const uint8_t *Src, int BpS);
308 typedef void (transfer_operation_16to8_t) (uint8_t *Dst, const int16_t *Src, int BpS);
309 
310 
311 static __inline void
MBTrans8to16(const MBParam * const pParam,const FRAMEINFO * const frame,const MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64])312 MBTrans8to16(const MBParam * const pParam,
313 			 const FRAMEINFO * const frame,
314 			 const MACROBLOCK * const pMB,
315 			 const uint32_t x_pos,
316 			 const uint32_t y_pos,
317 			 int16_t data[6 * 64])
318 {
319 	uint32_t stride = pParam->edged_width;
320 	uint32_t stride2 = stride / 2;
321 	uint32_t next_block = stride * 8;
322 	uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
323 	const IMAGE * const pCurrent = &frame->image;
324 
325 	/* Image pointers */
326 	pY_Cur = pCurrent->y + (y_pos << 4) * stride  + (x_pos << 4);
327 	pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3);
328 	pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3);
329 
330 	/* Do the transfer */
331 	start_timer();
332 	transfer_8to16copy(&data[0 * 64], pY_Cur, stride);
333 	transfer_8to16copy(&data[1 * 64], pY_Cur + 8, stride);
334 	transfer_8to16copy(&data[2 * 64], pY_Cur + next_block, stride);
335 	transfer_8to16copy(&data[3 * 64], pY_Cur + next_block + 8, stride);
336 	transfer_8to16copy(&data[4 * 64], pU_Cur, stride2);
337 	transfer_8to16copy(&data[5 * 64], pV_Cur, stride2);
338 	stop_transfer_timer();
339 }
340 
341 static __inline void
MBTrans16to8(const MBParam * const pParam,const FRAMEINFO * const frame,const MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],const uint32_t add,const uint8_t cbp)342 MBTrans16to8(const MBParam * const pParam,
343 			 const FRAMEINFO * const frame,
344 			 const MACROBLOCK * const pMB,
345 			 const uint32_t x_pos,
346 			 const uint32_t y_pos,
347 			 int16_t data[6 * 64],
348 			 const uint32_t add, /* Must be 1 or 0 */
349 			 const uint8_t cbp)
350 {
351 	uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
352 	uint32_t stride = pParam->edged_width;
353 	uint32_t stride2 = stride / 2;
354 	uint32_t next_block = stride * 8;
355 	const IMAGE * const pCurrent = &frame->image;
356 
357 	/* Array of function pointers, indexed by [add] */
358 	transfer_operation_16to8_t  * const functions[2] =
359 		{
360 			(transfer_operation_16to8_t*)transfer_16to8copy,
361 			(transfer_operation_16to8_t*)transfer_16to8add,
362 		};
363 
364 	transfer_operation_16to8_t *transfer_op = NULL;
365 
366 	/* Image pointers */
367 	pY_Cur = pCurrent->y + (y_pos << 4) * stride  + (x_pos << 4);
368 	pU_Cur = pCurrent->u + (y_pos << 3) * stride2 + (x_pos << 3);
369 	pV_Cur = pCurrent->v + (y_pos << 3) * stride2 + (x_pos << 3);
370 
371 	if (pMB->field_dct) {
372 		next_block = stride;
373 		stride *= 2;
374 	}
375 
376 	/* Operation function */
377 	transfer_op = functions[add];
378 
379 	/* Do the operation */
380 	start_timer();
381 	if (cbp&32) transfer_op(pY_Cur,						&data[0 * 64], stride);
382 	if (cbp&16) transfer_op(pY_Cur + 8,					&data[1 * 64], stride);
383 	if (cbp& 8) transfer_op(pY_Cur + next_block,		&data[2 * 64], stride);
384 	if (cbp& 4) transfer_op(pY_Cur + next_block + 8,	&data[3 * 64], stride);
385 	if (cbp& 2) transfer_op(pU_Cur,						&data[4 * 64], stride2);
386 	if (cbp& 1) transfer_op(pV_Cur,						&data[5 * 64], stride2);
387 	stop_transfer_timer();
388 }
389 
390 /*****************************************************************************
391  * Module functions
392  ****************************************************************************/
393 
394 void
MBTransQuantIntra(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])395 MBTransQuantIntra(const MBParam * const pParam,
396 				  const FRAMEINFO * const frame,
397 				  MACROBLOCK * const pMB,
398 				  const uint32_t x_pos,
399 				  const uint32_t y_pos,
400 				  int16_t data[6 * 64],
401 				  int16_t qcoeff[6 * 64])
402 {
403 
404 	/* Transfer data */
405 	MBTrans8to16(pParam, frame, pMB, x_pos, y_pos, data);
406 
407 	/* Perform DCT (and field decision) */
408 	MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
409 
410 	/* Quantize the block */
411 	MBQuantIntra(pParam, frame, pMB, data, qcoeff);
412 
413 	/* DeQuantize the block */
414 	MBDeQuantIntra(pParam, pMB->quant, data, qcoeff);
415 
416 	/* Perform inverse DCT*/
417 	MBiDCT(data, 0x3F);
418 
419  	/* Transfer back the data -- Don't add data */
420 	MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 0, 0x3F);
421 }
422 
423 
424 uint8_t
MBTransQuantInter(const MBParam * const pParam,const FRAMEINFO * const frame,MACROBLOCK * const pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])425 MBTransQuantInter(const MBParam * const pParam,
426 				  const FRAMEINFO * const frame,
427 				  MACROBLOCK * const pMB,
428 				  const uint32_t x_pos,
429 				  const uint32_t y_pos,
430 				  int16_t data[6 * 64],
431 				  int16_t qcoeff[6 * 64])
432 {
433 	uint8_t cbp;
434 	uint32_t limit;
435 
436 	/* There is no MBTrans8to16 for Inter block, that's done in motion compensation
437 	 * already */
438 
439 	/* Perform DCT (and field decision) */
440 	MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
441 
442 	/* Set the limit threshold */
443 	limit = PVOP_TOOSMALL_LIMIT + ((pMB->quant == 1)? 1 : 0);
444 
445 	if (frame->vop_flags & XVID_VOP_CARTOON)
446 		limit *= 3;
447 
448 	/* Quantize the block */
449 	cbp = MBQuantInter(pParam, frame, pMB, data, qcoeff, 0, limit);
450 
451 	/* DeQuantize the block */
452 	MBDeQuantInter(pParam, pMB->quant, data, qcoeff, cbp);
453 
454 	/* Perform inverse DCT*/
455 	MBiDCT(data, cbp);
456 
457  	/* Transfer back the data -- Add the data */
458 	MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 1, cbp);
459 
460 	return(cbp);
461 }
462 
463 uint8_t
MBTransQuantInterBVOP(const MBParam * pParam,FRAMEINFO * frame,MACROBLOCK * pMB,const uint32_t x_pos,const uint32_t y_pos,int16_t data[6* 64],int16_t qcoeff[6* 64])464 MBTransQuantInterBVOP(const MBParam * pParam,
465 					  FRAMEINFO * frame,
466 					  MACROBLOCK * pMB,
467 					  const uint32_t x_pos,
468 					  const uint32_t y_pos,
469 					  int16_t data[6 * 64],
470 					  int16_t qcoeff[6 * 64])
471 {
472 	uint8_t cbp;
473 	uint32_t limit;
474 
475 	/* There is no MBTrans8to16 for Inter block, that's done in motion compensation
476 	 * already */
477 
478 	/* Perform DCT (and field decision) */
479 	MBfDCT(pParam, frame, pMB, x_pos, y_pos, data);
480 
481 	/* Set the limit threshold */
482 	limit = BVOP_TOOSMALL_LIMIT;
483 
484 	if (frame->vop_flags & XVID_VOP_CARTOON)
485 		limit *= 2;
486 
487 	/* Quantize the block */
488 	cbp = MBQuantInter(pParam, frame, pMB, data, qcoeff, 1, limit);
489 
490 	/*
491 	 * History comment:
492 	 * We don't have to DeQuant, iDCT and Transfer back data for B-frames.
493 	 *
494 	 * BUT some plugins require the rebuilt original frame to be passed so we
495 	 * have to take care of that here
496 	 */
497 	if((pParam->plugin_flags & XVID_REQORIGINAL)) {
498 
499 		/* DeQuantize the block */
500 		MBDeQuantInter(pParam, pMB->quant, data, qcoeff, cbp);
501 
502 		/* Perform inverse DCT*/
503 		MBiDCT(data, cbp);
504 
505 		/* Transfer back the data -- Add the data */
506 		MBTrans16to8(pParam, frame, pMB, x_pos, y_pos, data, 1, cbp);
507 	}
508 
509 	return(cbp);
510 }
511 
512 /* if sum(diff between field lines) < sum(diff between frame lines), use field dct */
513 uint32_t
MBFieldTest_c(int16_t data[6* 64])514 MBFieldTest_c(int16_t data[6 * 64])
515 {
516 	const uint8_t blocks[] =
517 		{ 0 * 64, 0 * 64, 0 * 64, 0 * 64, 2 * 64, 2 * 64, 2 * 64, 2 * 64 };
518 	const uint8_t lines[] = { 0, 16, 32, 48, 0, 16, 32, 48 };
519 
520 	int frame = 0, field = 0;
521 	int i, j;
522 
523 	for (i = 0; i < 7; ++i) {
524 		for (j = 0; j < 8; ++j) {
525 			frame +=
526 				abs(data[0 * 64 + (i + 1) * 8 + j] - data[0 * 64 + i * 8 + j]);
527 			frame +=
528 				abs(data[1 * 64 + (i + 1) * 8 + j] - data[1 * 64 + i * 8 + j]);
529 			frame +=
530 				abs(data[2 * 64 + (i + 1) * 8 + j] - data[2 * 64 + i * 8 + j]);
531 			frame +=
532 				abs(data[3 * 64 + (i + 1) * 8 + j] - data[3 * 64 + i * 8 + j]);
533 
534 			field +=
535 				abs(data[blocks[i + 1] + lines[i + 1] + j] -
536 					data[blocks[i] + lines[i] + j]);
537 			field +=
538 				abs(data[blocks[i + 1] + lines[i + 1] + 8 + j] -
539 					data[blocks[i] + lines[i] + 8 + j]);
540 			field +=
541 				abs(data[blocks[i + 1] + 64 + lines[i + 1] + j] -
542 					data[blocks[i] + 64 + lines[i] + j]);
543 			field +=
544 				abs(data[blocks[i + 1] + 64 + lines[i + 1] + 8 + j] -
545 					data[blocks[i] + 64 + lines[i] + 8 + j]);
546 		}
547 	}
548 
549 	return (frame >= (field + 350));
550 }
551 
552 
553 /* deinterlace Y blocks vertically */
554 
555 #define MOVLINE(X,Y) memcpy(X, Y, sizeof(tmp))
556 #define LINE(X,Y)	&data[X*64 + Y*8]
557 
558 void
MBFrameToField(int16_t data[6* 64])559 MBFrameToField(int16_t data[6 * 64])
560 {
561 	int16_t tmp[8];
562 
563 	/* left blocks */
564 
565 	/* 1=2, 2=4, 4=8, 8=1 */
566 	MOVLINE(tmp, LINE(0, 1));
567 	MOVLINE(LINE(0, 1), LINE(0, 2));
568 	MOVLINE(LINE(0, 2), LINE(0, 4));
569 	MOVLINE(LINE(0, 4), LINE(2, 0));
570 	MOVLINE(LINE(2, 0), tmp);
571 
572 	/* 3=6, 6=12, 12=9, 9=3 */
573 	MOVLINE(tmp, LINE(0, 3));
574 	MOVLINE(LINE(0, 3), LINE(0, 6));
575 	MOVLINE(LINE(0, 6), LINE(2, 4));
576 	MOVLINE(LINE(2, 4), LINE(2, 1));
577 	MOVLINE(LINE(2, 1), tmp);
578 
579 	/* 5=10, 10=5 */
580 	MOVLINE(tmp, LINE(0, 5));
581 	MOVLINE(LINE(0, 5), LINE(2, 2));
582 	MOVLINE(LINE(2, 2), tmp);
583 
584 	/* 7=14, 14=13, 13=11, 11=7 */
585 	MOVLINE(tmp, LINE(0, 7));
586 	MOVLINE(LINE(0, 7), LINE(2, 6));
587 	MOVLINE(LINE(2, 6), LINE(2, 5));
588 	MOVLINE(LINE(2, 5), LINE(2, 3));
589 	MOVLINE(LINE(2, 3), tmp);
590 
591 	/* right blocks */
592 
593 	/* 1=2, 2=4, 4=8, 8=1 */
594 	MOVLINE(tmp, LINE(1, 1));
595 	MOVLINE(LINE(1, 1), LINE(1, 2));
596 	MOVLINE(LINE(1, 2), LINE(1, 4));
597 	MOVLINE(LINE(1, 4), LINE(3, 0));
598 	MOVLINE(LINE(3, 0), tmp);
599 
600 	/* 3=6, 6=12, 12=9, 9=3 */
601 	MOVLINE(tmp, LINE(1, 3));
602 	MOVLINE(LINE(1, 3), LINE(1, 6));
603 	MOVLINE(LINE(1, 6), LINE(3, 4));
604 	MOVLINE(LINE(3, 4), LINE(3, 1));
605 	MOVLINE(LINE(3, 1), tmp);
606 
607 	/* 5=10, 10=5 */
608 	MOVLINE(tmp, LINE(1, 5));
609 	MOVLINE(LINE(1, 5), LINE(3, 2));
610 	MOVLINE(LINE(3, 2), tmp);
611 
612 	/* 7=14, 14=13, 13=11, 11=7 */
613 	MOVLINE(tmp, LINE(1, 7));
614 	MOVLINE(LINE(1, 7), LINE(3, 6));
615 	MOVLINE(LINE(3, 6), LINE(3, 5));
616 	MOVLINE(LINE(3, 5), LINE(3, 3));
617 	MOVLINE(LINE(3, 3), tmp);
618 }
619 
620 /*****************************************************************************
621  *               Trellis based R-D optimal quantization
622  *
623  *   Trellis Quant code (C) 2003 Pascal Massimino skal(at)planet-d.net
624  *
625  ****************************************************************************/
626 
627 /*----------------------------------------------------------------------------
628  *
629  *        Trellis-Based quantization
630  *
631  * So far I understand this paper:
632  *
633  *  "Trellis-Based R-D Optimal Quantization in H.263+"
634  *    J.Wen, M.Luttrell, J.Villasenor
635  *    IEEE Transactions on Image Processing, Vol.9, No.8, Aug. 2000.
636  *
637  * we are at stake with a simplified Bellmand-Ford / Dijkstra Single
638  * Source Shortest Path algo. But due to the underlying graph structure
639  * ("Trellis"), it can be turned into a dynamic programming algo,
640  * partially saving the explicit graph's nodes representation. And
641  * without using a heap, since the open frontier of the DAG is always
642  * known, and of fixed size.
643  *--------------------------------------------------------------------------*/
644 
645 
646 
647 /* Codes lengths for relevant levels. */
648 
649 /* let's factorize: */
650 static const uint8_t Code_Len0[64] = {
651 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
652 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
653 static const uint8_t Code_Len1[64] = {
654 	20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
655 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
656 static const uint8_t Code_Len2[64] = {
657 	19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
658 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
659 static const uint8_t Code_Len3[64] = {
660 	18,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
661 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
662 static const uint8_t Code_Len4[64] = {
663 	17,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
664 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
665 static const uint8_t Code_Len5[64] = {
666 	16,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
667 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
668 static const uint8_t Code_Len6[64] = {
669 	15,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
670 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
671 static const uint8_t Code_Len7[64] = {
672 	13,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
673 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
674 static const uint8_t Code_Len8[64] = {
675 	11,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
676 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
677 static const uint8_t Code_Len9[64] = {
678 	12,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
679 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
680 static const uint8_t Code_Len10[64] = {
681 	12,20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
682 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
683 static const uint8_t Code_Len11[64] = {
684 	12,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
685 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
686 static const uint8_t Code_Len12[64] = {
687 	11,17,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
688 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
689 static const uint8_t Code_Len13[64] = {
690 	11,15,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
691 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
692 static const uint8_t Code_Len14[64] = {
693 	10,12,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
694 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
695 static const uint8_t Code_Len15[64] = {
696 	10,13,17,19,21,21,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
697 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
698 static const uint8_t Code_Len16[64] = {
699 	9,12,13,18,18,19,19,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
700 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
701 static const uint8_t Code_Len17[64] = {
702 	8,11,13,14,14,14,15,19,19,19,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
703 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
704 static const uint8_t Code_Len18[64] = {
705 	7, 9,11,11,13,13,13,15,15,15,16,22,22,22,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
706 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
707 static const uint8_t Code_Len19[64] = {
708 	5, 7, 9,10,10,11,11,11,11,11,13,14,16,17,17,18,18,18,18,18,18,18,18,20,20,21,21,30,30,30,30,30,
709 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30 };
710 static const uint8_t Code_Len20[64] = {
711 	3, 4, 5, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9,10,10,10,10,10,10,10,10,12,12,13,13,12,13,14,15,15,
712 	15,16,16,16,16,17,17,17,18,18,19,19,19,19,19,19,19,19,21,21,22,22,30,30,30,30,30,30,30,30,30,30 };
713 
714 /* a few more table for LAST table: */
715 static const uint8_t Code_Len21[64] = {
716 	13,20,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
717 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
718 static const uint8_t Code_Len22[64] = {
719 	12,15,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,
720 	30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
721 static const uint8_t Code_Len23[64] = {
722 	10,12,15,15,15,16,16,16,16,17,17,17,17,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,20,20,20,
723 	20,21,21,21,21,21,21,21,21,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30};
724 static const uint8_t Code_Len24[64] = {
725 	5, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,10,10,10,10,10,10,10,10,11,11,11,11,12,12,12,
726 	12,13,13,13,13,13,13,13,13,14,16,16,16,16,17,17,17,17,18,18,18,18,18,18,18,18,19,19,19,19,19,19};
727 
728 
729 static const uint8_t * const B16_17_Code_Len[24] = { /* levels [1..24] */
730 	Code_Len20,Code_Len19,Code_Len18,Code_Len17,
731 	Code_Len16,Code_Len15,Code_Len14,Code_Len13,
732 	Code_Len12,Code_Len11,Code_Len10,Code_Len9,
733 	Code_Len8, Code_Len7 ,Code_Len6 ,Code_Len5,
734 	Code_Len4, Code_Len3, Code_Len3 ,Code_Len2,
735 	Code_Len2, Code_Len1, Code_Len1, Code_Len1,
736 };
737 
738 static const uint8_t * const B16_17_Code_Len_Last[6] = { /* levels [1..6] */
739 	Code_Len24,Code_Len23,Code_Len22,Code_Len21, Code_Len3, Code_Len1,
740 };
741 
742 /* TL_SHIFT controls the precision of the RD optimizations in trellis
743  * valid range is [10..16]. The bigger, the more trellis is vulnerable
744  * to overflows in cost formulas.
745  *  - 10 allows ac values up to 2^11 == 2048
746  *  - 16 allows ac values up to 2^8 == 256
747  */
748 #define TL_SHIFT 11
749 #define TL(q) ((0xfe00>>(16-TL_SHIFT))/(q*q))
750 
751 static const int Trellis_Lambda_Tabs[31] = {
752 	TL( 1),TL( 2),TL( 3),TL( 4),TL( 5),TL( 6), TL( 7),
753 	TL( 8),TL( 9),TL(10),TL(11),TL(12),TL(13),TL(14), TL(15),
754 	TL(16),TL(17),TL(18),TL(19),TL(20),TL(21),TL(22), TL(23),
755 	TL(24),TL(25),TL(26),TL(27),TL(28),TL(29),TL(30), TL(31)
756 };
757 #undef TL
758 
759 static int __inline
Find_Last(const int16_t * C,const uint16_t * Zigzag,int i)760 Find_Last(const int16_t *C, const uint16_t *Zigzag, int i)
761 {
762 	while(i>=0)
763 		if (C[Zigzag[i]])
764 			return i;
765 		else i--;
766 	return -1;
767 }
768 
769 #define TRELLIS_MIN_EFFORT	3
770 
calc_mseh(int16_t dQ,uint16_t mask,const int index,const int Lambda)771 static __inline uint32_t calc_mseh(int16_t dQ, uint16_t mask,
772                                    const int index, const int Lambda)
773 {
774 	uint32_t t = (mask * Inv_iMask_Coeff[index] + 32) >> 7;
775 	uint16_t u = abs(dQ) << 4;
776 	uint16_t thresh = (t < 65536) ? t : 65535;
777 
778 	if (u <= thresh)
779 		u = 0; /* The error is not perceivable */
780 	else
781 		u -= thresh;
782 
783 	u = ((u + iCSF_Round[index]) * iCSF_Coeff[index]) >> 16;
784 
785 	return Lambda*((((u*u)>>4) + 4*(dQ*dQ))/5);
786 }
787 
788 /* this routine has been strippen of all debug code */
789 static int
dct_quantize_trellis_c(int16_t * const Out,const int16_t * const In,int Q,const uint16_t * const Zigzag,const uint16_t * const QuantMatrix,int Non_Zero,int Sum,int Lambda_Mod,const uint32_t rel_var8,const int Metric)790 dct_quantize_trellis_c(int16_t *const Out,
791 					   const int16_t *const In,
792 					   int Q,
793 					   const uint16_t * const Zigzag,
794 					   const uint16_t * const QuantMatrix,
795 					   int Non_Zero,
796 					   int Sum,
797 					   int Lambda_Mod,
798 					   const uint32_t rel_var8,
799 					   const int Metric)
800 {
801 
802 	/* Note: We should search last non-zero coeffs on *real* DCT input coeffs
803 	 * (In[]), not quantized one (Out[]). However, it only improves the result
804 	 * *very* slightly (~0.01dB), whereas speed drops to crawling level :)
805 	 * Well, actually, taking 1 more coeff past Non_Zero into account sometimes
806 	 * helps. */
807 	typedef struct { int16_t Run, Level; } NODE;
808 
809 	NODE Nodes[65], Last = { 0, 0};
810 	uint32_t Run_Costs0[64+1];
811 	uint32_t * const Run_Costs = Run_Costs0 + 1;
812 
813 	/* it's 1/lambda, actually */
814 	const int Lambda = (Lambda_Mod*Trellis_Lambda_Tabs[Q-1])>>LAMBDA_EXP;
815 
816 	int Run_Start = -1;
817 	uint32_t Min_Cost = 2<<TL_SHIFT;
818 
819 	int Last_Node = -1;
820 	uint32_t Last_Cost = 0;
821 
822 	int i, j;
823 
824 	uint32_t mask = (Metric) ? ((isqrt(2*coeff8_energy(In)*rel_var8) + 48) >> 6) : 0;
825 
826 	/* source (w/ CBP penalty) */
827 	Run_Costs[-1] = 2<<TL_SHIFT;
828 
829 	Non_Zero = Find_Last(Out, Zigzag, Non_Zero);
830 	if (Non_Zero < TRELLIS_MIN_EFFORT)
831 		Non_Zero = TRELLIS_MIN_EFFORT;
832 
833 	for(i=0; i<=Non_Zero; i++) {
834 		const int q = ((Q*QuantMatrix[Zigzag[i]])>>4);
835 		const int Mult = 2*q;
836 		const int Bias = (q-1) | 1;
837 		const int Lev0 = Mult + Bias;
838 
839 		const int AC = In[Zigzag[i]];
840 		const int Level1 = Out[Zigzag[i]];
841 		const unsigned int Dist0 = (Metric) ? (calc_mseh(AC, mask, Zigzag[i], Lambda)) : (Lambda* AC*AC);
842 		uint32_t Best_Cost = 0xf0000000;
843 		Last_Cost += Dist0;
844 
845 		/* very specialized loop for -1,0,+1 */
846 		if ((uint32_t)(Level1+1)<3) {
847 			int dQ;
848 			int Run;
849 			uint32_t Cost0;
850 
851 			if (AC<0) {
852 				Nodes[i].Level = -1;
853 				dQ = Lev0 + AC;
854 			} else {
855 				Nodes[i].Level = 1;
856 				dQ = Lev0 - AC;
857 			}
858 			Cost0 = (Metric) ? (calc_mseh(dQ, mask, Zigzag[i], Lambda)) : (Lambda*dQ*dQ);
859 
860 			Nodes[i].Run = 1;
861 			Best_Cost = (Code_Len20[0]<<TL_SHIFT) + Run_Costs[i-1]+Cost0;
862 			for(Run=i-Run_Start; Run>0; --Run) {
863 				const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run];
864 				const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<TL_SHIFT);
865 				const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<TL_SHIFT);
866 
867 				/* TODO: what about tie-breaks? Should we favor short runs or
868 				 * long runs? Although the error is the same, it would not be
869 				 * spread the same way along high and low frequencies... */
870 
871 				/* Gruel: I'd say, favour short runs => hifreq errors (HVS) */
872 
873 				if (Cost<Best_Cost) {
874 					Best_Cost	 = Cost;
875 					Nodes[i].Run = Run;
876 				}
877 
878 				if (lCost<Last_Cost) {
879 					Last_Cost  = lCost;
880 					Last.Run   = Run;
881 					Last_Node  = i;
882 				}
883 			}
884 			if (Last_Node==i)
885 				Last.Level = Nodes[i].Level;
886 		} else if (51U>(uint32_t)(Level1+25)) {
887 			/* "big" levels (not less than ESC3, though) */
888 			const uint8_t *Tbl_L1, *Tbl_L2, *Tbl_L1_Last, *Tbl_L2_Last;
889 			int Level2;
890 			int dQ1, dQ2;
891 			int Run;
892 			uint32_t Dist1,Dist2;
893 			int dDist21;
894 
895 			if (Level1>1) {
896 				dQ1 = Level1*Mult-AC + Bias;
897 				dQ2 = dQ1 - Mult;
898 				Level2 = Level1-1;
899 				Tbl_L1		= (Level1<=24) ? B16_17_Code_Len[Level1-1]	   : Code_Len0;
900 				Tbl_L2		= (Level2<=24) ? B16_17_Code_Len[Level2-1]	   : Code_Len0;
901 				Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0;
902 				Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0;
903 			} else { /* Level1<-1 */
904 				dQ1 = Level1*Mult-AC - Bias;
905 				dQ2 = dQ1 + Mult;
906 				Level2 = Level1 + 1;
907 				Tbl_L1		= (Level1>=-24) ? B16_17_Code_Len[Level1^-1]	  : Code_Len0;
908 				Tbl_L2		= (Level2>=-24) ? B16_17_Code_Len[Level2^-1]	  : Code_Len0;
909 				Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0;
910 				Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0;
911 			}
912 
913 			if (Metric) {
914 				Dist1 = calc_mseh(dQ1, mask, Zigzag[i], Lambda);
915 				Dist2 = calc_mseh(dQ2, mask, Zigzag[i], Lambda);
916 			}
917 			else {
918 				Dist1 = Lambda*dQ1*dQ1;
919 				Dist2 = Lambda*dQ2*dQ2;
920 			}
921 			dDist21 = Dist2-Dist1;
922 
923 			for(Run=i-Run_Start; Run>0; --Run)
924 			{
925 				const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run];
926 				uint32_t Cost1, Cost2;
927 				int bLevel;
928 
929 				/* for sub-optimal (but slightly worth it, speed-wise) search,
930 				 * uncomment the following:
931 				 *		if (Cost_Base>=Best_Cost) continue;
932 				 * (? doesn't seem to have any effect -- gruel ) */
933 
934 				Cost1 = Cost_Base + (Tbl_L1[Run-1]<<TL_SHIFT);
935 				Cost2 = Cost_Base + (Tbl_L2[Run-1]<<TL_SHIFT) + dDist21;
936 
937 				if (Cost2<Cost1) {
938 					Cost1 = Cost2;
939 					bLevel = Level2;
940 				} else {
941 					bLevel = Level1;
942 				}
943 
944 				if (Cost1<Best_Cost) {
945 					Best_Cost = Cost1;
946 					Nodes[i].Run   = Run;
947 					Nodes[i].Level = bLevel;
948 				}
949 
950 				Cost1 = Cost_Base + (Tbl_L1_Last[Run-1]<<TL_SHIFT);
951 				Cost2 = Cost_Base + (Tbl_L2_Last[Run-1]<<TL_SHIFT) + dDist21;
952 
953 				if (Cost2<Cost1) {
954 					Cost1 = Cost2;
955 					bLevel = Level2;
956 				} else {
957 					bLevel = Level1;
958 				}
959 
960 				if (Cost1<Last_Cost) {
961 					Last_Cost  = Cost1;
962 					Last.Run   = Run;
963 					Last.Level = bLevel;
964 					Last_Node  = i;
965 				}
966 			} /* end of "for Run" */
967 		} else {
968 			/* Very very high levels, with no chance of being optimizable
969 			 * => Simply pick best Run. */
970 			int Run;
971 			for(Run=i-Run_Start; Run>0; --Run) {
972 				/* 30 bits + no distortion */
973 				const uint32_t Cost = (30<<TL_SHIFT) + Run_Costs[i-Run];
974 				if (Cost<Best_Cost) {
975 					Best_Cost = Cost;
976 					Nodes[i].Run   = Run;
977 					Nodes[i].Level = Level1;
978 				}
979 
980 				if (Cost<Last_Cost) {
981 					Last_Cost  = Cost;
982 					Last.Run   = Run;
983 					Last.Level = Level1;
984 					Last_Node  = i;
985 				}
986 			}
987 		}
988 
989 
990 		Run_Costs[i] = Best_Cost;
991 
992 		if (Best_Cost < Min_Cost + Dist0) {
993 			Min_Cost = Best_Cost;
994 			Run_Start = i;
995 		} else {
996 			/* as noticed by Michael Niedermayer (michaelni at gmx.at),
997 			 * there's a code shorter by 1 bit for a larger run (!), same
998 			 * level. We give it a chance by not moving the left barrier too
999 			 * much. */
1000 			while( Run_Costs[Run_Start]>Min_Cost+(1<<TL_SHIFT) )
1001 				Run_Start++;
1002 
1003 			/* spread on preceding coeffs the cost incurred by skipping this
1004 			 * one */
1005 			for(j=Run_Start; j<i; ++j) Run_Costs[j] += Dist0;
1006 			Min_Cost += Dist0;
1007 		}
1008 	}
1009 
1010 	/* It seems trellis doesn't give good results... just leave the block untouched
1011 	 * and return the original sum value */
1012 	if (Last_Node<0)
1013 		return Sum;
1014 
1015 	/* reconstruct optimal sequence backward with surviving paths */
1016 	memset(Out, 0x00, 64*sizeof(*Out));
1017 	Out[Zigzag[Last_Node]] = Last.Level;
1018 	i = Last_Node - Last.Run;
1019 	Sum = abs(Last.Level);
1020 	while(i>=0) {
1021 		Out[Zigzag[i]] = Nodes[i].Level;
1022 		Sum += abs(Nodes[i].Level);
1023 		i -= Nodes[i].Run;
1024 	}
1025 
1026 	return Sum;
1027 }
1028 
1029 /* original version including heavy debugging info */
1030 
1031 #ifdef DBGTRELL
1032 
1033 #define DBG 0
1034 
Evaluate_Cost(const int16_t * C,int Mult,int Bias,const uint16_t * Zigzag,int Max,int Lambda)1035 static __inline uint32_t Evaluate_Cost(const int16_t *C, int Mult, int Bias,
1036 									   const uint16_t * Zigzag, int Max, int Lambda)
1037 {
1038 #if (DBG>0)
1039 	const int16_t * const Ref = C + 6*64;
1040 	int Last = Max;
1041 	int Bits = 0;
1042 	int Dist = 0;
1043 	int i;
1044 	uint32_t Cost;
1045 
1046 	while(Last>=0 && C[Zigzag[Last]]==0)
1047 		Last--;
1048 
1049 	if (Last>=0) {
1050 		int j=0, j0=0;
1051 		int Run, Level;
1052 
1053 		Bits = 2;   /* CBP */
1054 		while(j<Last) {
1055 			while(!C[Zigzag[j]])
1056 				j++;
1057 			if (j==Last)
1058 				break;
1059 			Level=C[Zigzag[j]];
1060 			Run = j - j0;
1061 			j0 = ++j;
1062 			if (Level>=-24 && Level<=24)
1063 				Bits += B16_17_Code_Len[(Level<0) ? -Level-1 : Level-1][Run];
1064 			else
1065 				Bits += 30;
1066 		}
1067 		Level = C[Zigzag[Last]];
1068 		Run = j - j0;
1069 		if (Level>=-6 && Level<=6)
1070 			Bits += B16_17_Code_Len_Last[(Level<0) ? -Level-1 : Level-1][Run];
1071 		else
1072 			Bits += 30;
1073 	}
1074 
1075 	for(i=0; i<=Last; ++i) {
1076 		int V = C[Zigzag[i]]*Mult;
1077 		if (V>0)
1078 			V += Bias;
1079 		else
1080 			if (V<0)
1081 				V -= Bias;
1082 		V -= Ref[Zigzag[i]];
1083 		Dist += V*V;
1084 	}
1085 	Cost = Lambda*Dist + (Bits<<TL_SHIFT);
1086 	if (DBG==1)
1087 		printf( " Last:%2d/%2d Cost = [(Bits=%5.0d) + Lambda*(Dist=%6.0d) = %d ] >>12= %d ", Last,Max, Bits, Dist, Cost, Cost>>12 );
1088 	return Cost;
1089 
1090 #else
1091 	return 0;
1092 #endif
1093 }
1094 
1095 
1096 static int
dct_quantize_trellis_h263_c(int16_t * const Out,const int16_t * const In,int Q,const uint16_t * const Zigzag,int Non_Zero)1097 dct_quantize_trellis_h263_c(int16_t *const Out, const int16_t *const In, int Q, const uint16_t * const Zigzag, int Non_Zero)
1098 {
1099 
1100     /*
1101 	 * Note: We should search last non-zero coeffs on *real* DCT input coeffs (In[]),
1102 	 * not quantized one (Out[]). However, it only improves the result *very*
1103 	 * slightly (~0.01dB), whereas speed drops to crawling level :)
1104 	 * Well, actually, taking 1 more coeff past Non_Zero into account sometimes helps.
1105 	 */
1106 	typedef struct { int16_t Run, Level; } NODE;
1107 
1108 	NODE Nodes[65], Last;
1109 	uint32_t Run_Costs0[64+1];
1110 	uint32_t * const Run_Costs = Run_Costs0 + 1;
1111 	const int Mult = 2*Q;
1112 	const int Bias = (Q-1) | 1;
1113 	const int Lev0 = Mult + Bias;
1114 	const int Lambda = Trellis_Lambda_Tabs[Q-1];    /* it's 1/lambda, actually */
1115 
1116 	int Run_Start = -1;
1117 	Run_Costs[-1] = 2<<TL_SHIFT;                          /* source (w/ CBP penalty) */
1118 	uint32_t Min_Cost = 2<<TL_SHIFT;
1119 
1120 	int Last_Node = -1;
1121 	uint32_t Last_Cost = 0;
1122 
1123 	int i, j;
1124 
1125 #if (DBG>0)
1126 	Last.Level = 0; Last.Run = -1; /* just initialize to smthg */
1127 #endif
1128 
1129 	Non_Zero = Find_Last(Out, Zigzag, Non_Zero);
1130 	if (Non_Zero<0)
1131 		return -1;
1132 
1133 	for(i=0; i<=Non_Zero; i++)
1134 	{
1135 		const int AC = In[Zigzag[i]];
1136 		const int Level1 = Out[Zigzag[i]];
1137 		const int Dist0 = Lambda* AC*AC;
1138 		uint32_t Best_Cost = 0xf0000000;
1139 		Last_Cost += Dist0;
1140 
1141 		if ((uint32_t)(Level1+1)<3)                 /* very specialized loop for -1,0,+1 */
1142 		{
1143 			int dQ;
1144 			int Run;
1145 			uint32_t Cost0;
1146 
1147 			if (AC<0) {
1148 				Nodes[i].Level = -1;
1149 				dQ = Lev0 + AC;
1150 			} else {
1151 				Nodes[i].Level = 1;
1152 				dQ = Lev0 - AC;
1153 			}
1154 			Cost0 = Lambda*dQ*dQ;
1155 
1156 			Nodes[i].Run = 1;
1157 			Best_Cost = (Code_Len20[0]<<TL_SHIFT) + Run_Costs[i-1]+Cost0;
1158 			for(Run=i-Run_Start; Run>0; --Run)
1159 			{
1160 				const uint32_t Cost_Base = Cost0 + Run_Costs[i-Run];
1161 				const uint32_t Cost = Cost_Base + (Code_Len20[Run-1]<<TL_SHIFT);
1162 				const uint32_t lCost = Cost_Base + (Code_Len24[Run-1]<<TL_SHIFT);
1163 
1164 				/*
1165 				 * TODO: what about tie-breaks? Should we favor short runs or
1166 				 * long runs? Although the error is the same, it would not be
1167 				 * spread the same way along high and low frequencies...
1168 				 */
1169 				if (Cost<Best_Cost) {
1170 					Best_Cost    = Cost;
1171 					Nodes[i].Run = Run;
1172 				}
1173 
1174 				if (lCost<Last_Cost) {
1175 					Last_Cost  = lCost;
1176 					Last.Run   = Run;
1177 					Last_Node  = i;
1178 				}
1179 			}
1180 			if (Last_Node==i)
1181 				Last.Level = Nodes[i].Level;
1182 
1183 			if (DBG==1) {
1184 				Run_Costs[i] = Best_Cost;
1185 				printf( "Costs #%2d: ", i);
1186 				for(j=-1;j<=Non_Zero;++j) {
1187 					if (j==Run_Start)            printf( " %3.0d|", Run_Costs[j]>>12 );
1188 					else if (j>Run_Start && j<i) printf( " %3.0d|", Run_Costs[j]>>12 );
1189 					else if (j==i)               printf( "(%3.0d)", Run_Costs[j]>>12 );
1190 					else                         printf( "  - |" );
1191 				}
1192 				printf( "<%3.0d %2d %d>", Min_Cost>>12, Nodes[i].Level, Nodes[i].Run );
1193 				printf( "  Last:#%2d {%3.0d %2d %d}", Last_Node, Last_Cost>>12, Last.Level, Last.Run );
1194 				printf( " AC:%3.0d Dist0:%3d Dist(%d)=%d", AC, Dist0>>12, Nodes[i].Level, Cost0>>12 );
1195 				printf( "\n" );
1196 			}
1197 		}
1198 		else                      /* "big" levels */
1199 		{
1200 			const uint8_t *Tbl_L1, *Tbl_L2, *Tbl_L1_Last, *Tbl_L2_Last;
1201 			int Level2;
1202 			int dQ1, dQ2;
1203 			int Run;
1204 			uint32_t Dist1,Dist2;
1205 			int dDist21;
1206 
1207 			if (Level1>1) {
1208 				dQ1 = Level1*Mult-AC + Bias;
1209 				dQ2 = dQ1 - Mult;
1210 				Level2 = Level1-1;
1211 				Tbl_L1      = (Level1<=24) ? B16_17_Code_Len[Level1-1]     : Code_Len0;
1212 				Tbl_L2      = (Level2<=24) ? B16_17_Code_Len[Level2-1]     : Code_Len0;
1213 				Tbl_L1_Last = (Level1<=6) ? B16_17_Code_Len_Last[Level1-1] : Code_Len0;
1214 				Tbl_L2_Last = (Level2<=6) ? B16_17_Code_Len_Last[Level2-1] : Code_Len0;
1215 			} else { /* Level1<-1 */
1216 				dQ1 = Level1*Mult-AC - Bias;
1217 				dQ2 = dQ1 + Mult;
1218 				Level2 = Level1 + 1;
1219 				Tbl_L1      = (Level1>=-24) ? B16_17_Code_Len[Level1^-1]      : Code_Len0;
1220 				Tbl_L2      = (Level2>=-24) ? B16_17_Code_Len[Level2^-1]      : Code_Len0;
1221 				Tbl_L1_Last = (Level1>=- 6) ? B16_17_Code_Len_Last[Level1^-1] : Code_Len0;
1222 				Tbl_L2_Last = (Level2>=- 6) ? B16_17_Code_Len_Last[Level2^-1] : Code_Len0;
1223 			}
1224 			Dist1 = Lambda*dQ1*dQ1;
1225 			Dist2 = Lambda*dQ2*dQ2;
1226 			dDist21 = Dist2-Dist1;
1227 
1228 			for(Run=i-Run_Start; Run>0; --Run)
1229 			{
1230 				const uint32_t Cost_Base = Dist1 + Run_Costs[i-Run];
1231 				uint32_t Cost1, Cost2;
1232 				int bLevel;
1233 
1234 /*
1235  * for sub-optimal (but slightly worth it, speed-wise) search, uncomment the following:
1236  *        if (Cost_Base>=Best_Cost) continue;
1237  */
1238 				Cost1 = Cost_Base + (Tbl_L1[Run-1]<<TL_SHIFT);
1239 				Cost2 = Cost_Base + (Tbl_L2[Run-1]<<TL_SHIFT) + dDist21;
1240 
1241 				if (Cost2<Cost1) {
1242 					Cost1 = Cost2;
1243 					bLevel = Level2;
1244 				} else
1245 					bLevel = Level1;
1246 
1247 				if (Cost1<Best_Cost) {
1248 					Best_Cost = Cost1;
1249 					Nodes[i].Run   = Run;
1250 					Nodes[i].Level = bLevel;
1251 				}
1252 
1253 				Cost1 = Cost_Base + (Tbl_L1_Last[Run-1]<<TL_SHIFT);
1254 				Cost2 = Cost_Base + (Tbl_L2_Last[Run-1]<<TL_SHIFT) + dDist21;
1255 
1256 				if (Cost2<Cost1) {
1257 					Cost1 = Cost2;
1258 					bLevel = Level2;
1259 				} else
1260 					bLevel = Level1;
1261 
1262 				if (Cost1<Last_Cost) {
1263 					Last_Cost  = Cost1;
1264 					Last.Run   = Run;
1265 					Last.Level = bLevel;
1266 					Last_Node  = i;
1267 				}
1268 			} /* end of "for Run" */
1269 
1270 			if (DBG==1) {
1271 				Run_Costs[i] = Best_Cost;
1272 				printf( "Costs #%2d: ", i);
1273 				for(j=-1;j<=Non_Zero;++j) {
1274 					if (j==Run_Start)            printf( " %3.0d|", Run_Costs[j]>>12 );
1275 					else if (j>Run_Start && j<i) printf( " %3.0d|", Run_Costs[j]>>12 );
1276 					else if (j==i)               printf( "(%3.0d)", Run_Costs[j]>>12 );
1277 					else                         printf( "  - |" );
1278 				}
1279 				printf( "<%3.0d %2d %d>", Min_Cost>>12, Nodes[i].Level, Nodes[i].Run );
1280 				printf( "  Last:#%2d {%3.0d %2d %d}", Last_Node, Last_Cost>>12, Last.Level, Last.Run );
1281 				printf( " AC:%3.0d Dist0:%3d Dist(%2d):%3d Dist(%2d):%3d", AC, Dist0>>12, Level1, Dist1>>12, Level2, Dist2>>12 );
1282 				printf( "\n" );
1283 			}
1284 		}
1285 
1286 		Run_Costs[i] = Best_Cost;
1287 
1288 		if (Best_Cost < Min_Cost + Dist0) {
1289 			Min_Cost = Best_Cost;
1290 			Run_Start = i;
1291 		}
1292 		else
1293 		{
1294 			/*
1295 			 * as noticed by Michael Niedermayer (michaelni at gmx.at), there's
1296 			 * a code shorter by 1 bit for a larger run (!), same level. We give
1297 			 * it a chance by not moving the left barrier too much.
1298 			 */
1299 
1300 			while( Run_Costs[Run_Start]>Min_Cost+(1<<TL_SHIFT) )
1301 				Run_Start++;
1302 
1303 			/* spread on preceding coeffs the cost incurred by skipping this one */
1304 			for(j=Run_Start; j<i; ++j) Run_Costs[j] += Dist0;
1305 			Min_Cost += Dist0;
1306 		}
1307 	}
1308 
1309 	if (DBG) {
1310 		Last_Cost = Evaluate_Cost(Out,Mult,Bias, Zigzag,Non_Zero, Lambda);
1311 		if (DBG==1) {
1312 			printf( "=> " );
1313 			for(i=0; i<=Non_Zero; ++i) printf( "[%3.0d] ", Out[Zigzag[i]] );
1314 			printf( "\n" );
1315 		}
1316 	}
1317 
1318 	if (Last_Node<0)
1319 		return -1;
1320 
1321 	/* reconstruct optimal sequence backward with surviving paths */
1322 	memset(Out, 0x00, 64*sizeof(*Out));
1323 	Out[Zigzag[Last_Node]] = Last.Level;
1324 	i = Last_Node - Last.Run;
1325 	while(i>=0) {
1326 		Out[Zigzag[i]] = Nodes[i].Level;
1327 		i -= Nodes[i].Run;
1328 	}
1329 
1330 	if (DBG) {
1331 		uint32_t Cost = Evaluate_Cost(Out,Mult,Bias, Zigzag,Non_Zero, Lambda);
1332 		if (DBG==1) {
1333 			printf( "<= " );
1334 			for(i=0; i<=Last_Node; ++i) printf( "[%3.0d] ", Out[Zigzag[i]] );
1335 			printf( "\n--------------------------------\n" );
1336 		}
1337 		if (Cost>Last_Cost) printf( "!!! %u > %u\n", Cost, Last_Cost );
1338 	}
1339 	return Last_Node;
1340 }
1341 
1342 #undef DBG
1343 
1344 #endif
1345