1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once;
the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use git log
75
76 #include "config.h"
77
78 #include "libavutil/avutil.h"
79 #include "libavutil/avassert.h"
80 #include <inttypes.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 //#undef HAVE_MMXEXT_INLINE
85 //#define HAVE_AMD3DNOW_INLINE
86 //#undef HAVE_MMX_INLINE
87 //#undef ARCH_X86
88 //#define DEBUG_BRIGHTNESS
89 #include "postprocess.h"
90 #include "postprocess_internal.h"
91 #include "libavutil/avstring.h"
92
postproc_version(void)93 unsigned postproc_version(void)
94 {
95 av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
96 return LIBPOSTPROC_VERSION_INT;
97 }
98
postproc_configuration(void)99 const char *postproc_configuration(void)
100 {
101 return FFMPEG_CONFIGURATION;
102 }
103
postproc_license(void)104 const char *postproc_license(void)
105 {
106 #define LICENSE_PREFIX "libpostproc license: "
107 return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
108 }
109
110 #if HAVE_ALTIVEC_H
111 #include <altivec.h>
112 #endif
113
/* Size of the scratch buffer holding the working copy of a mode string
 * in pp_get_mode_by_name_and_quality(). */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array of unrecognised option tokens
 * (the terminating NULL entry included). */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows / columns the C block filters in this file iterate over. */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in the visible part of this file;
 * presumably used by the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
119
#if ARCH_X86 && HAVE_INLINE_ASM
/* Packed 64-bit constants: wXX repeats the 16-bit value 0x00XX four times,
 * bXX repeats the byte 0xXX eight times.
 * NOTE(review): presumably operands for the inline-asm templates included
 * further down -- confirm against postprocess_template.c. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): not read in the visible part of this file; presumably the
 * threshold of the dering filter implemented in the templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
132
133
/*
 * Table of all filters that can be named in a mode string.
 * pp_get_mode_by_name_and_quality() matches a filter name against the
 * shortName and longName members and walks the table until an entry whose
 * shortName is NULL.
 * NOTE(review): the columns presumably are short name, long name,
 * chromDefault, minLumQuality, minChromQuality, mask -- confirm against the
 * declaration of struct PPFilter.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering", 1, 5, 6, DERING},
    {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
    {"be", "bitexact", 1, 0, 0, BITEXACT},
    {NULL, NULL,0,0,0,0} //End Marker
};
157
/*
 * Alias table used by pp_get_mode_by_name_and_quality(): entries come in
 * pairs (alias name, replacement filter list). When a filter name equals
 * an alias, the replacement text is spliced into the mode string being
 * parsed. The table ends at the first NULL alias.
 */
static const char * const replaceTable[]=
{
    "default", "hb:a,vb:a,dr:a",
    "de", "hb:a,vb:a,dr:a",
    "fast", "h1:a,v1:a,dr:a",
    "fa", "h1:a,v1:a,dr:a",
    "ac", "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
167
168
169 #if ARCH_X86 && HAVE_INLINE_ASM
/** Issue an x86 "prefetchnta" instruction for the address p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile("prefetchnta (%0)\n\t" : : "r" (p));
}
176
/** Issue an x86 "prefetcht0" instruction for the address p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile("prefetcht0 (%0)\n\t" : : "r" (p));
}
183
/** Issue an x86 "prefetcht1" instruction for the address p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile("prefetcht1 (%0)\n\t" : : "r" (p));
}
190
/** Issue an x86 "prefetcht2" instruction for the address p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile("prefetcht2 (%0)\n\t" : : "r" (p));
}
197 #endif
198
199 /* The horizontal functions exist only in C because the MMX
200 * code is faster with vertical filters and transposing. */
201
202 /**
203 * Check if the given 8x8 Block is mostly "flat"
204 */
isHorizDC_C(const uint8_t src[],int stride,const PPContext * c)205 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
206 {
207 int numEq= 0;
208 int y;
209 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
210 const int dcThreshold= dcOffset*2 + 1;
211
212 for(y=0; y<BLOCK_SIZE; y++){
213 if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
214 if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
215 if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
216 if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
217 if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
218 if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
219 if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
220 src+= stride;
221 }
222 return numEq > c->ppMode.flatnessThreshold;
223 }
224
225 /**
226 * Check if the middle 8x8 Block in the given 8x16 block is flat
227 */
isVertDC_C(const uint8_t src[],int stride,const PPContext * c)228 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
229 {
230 int numEq= 0;
231 int y;
232 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
233 const int dcThreshold= dcOffset*2 + 1;
234
235 src+= stride*4; // src points to begin of the 8x8 Block
236 for(y=0; y<BLOCK_SIZE-1; y++){
237 if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
238 if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
239 if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
240 if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
241 if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
242 if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
243 if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
244 if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
245 src+= stride;
246 }
247 return numEq > c->ppMode.flatnessThreshold;
248 }
249
/**
 * Sparse horizontal range check over 8 rows.
 *
 * In every row one pair of pixels is sampled (columns 0/5, 2/7, 4/1, 6/3,
 * repeating every four rows); the check fails as soon as the difference of
 * a sampled pair falls outside [-2*QP, 2*QP].
 *
 * @param src    first pixel of the first row
 * @param stride distance between two rows, in bytes
 * @param QP     quantiser used to scale the allowed difference
 * @return 1 if all sampled pairs are within range, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    /* column pair sampled in row (n mod 4) */
    static const int colPair[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    int row;

    for (row = 0; row < 8; row++) {
        const uint8_t *line = src + row * stride;
        const int a = line[colPair[row & 3][0]];
        const int b = line[colPair[row & 3][1]];

        if ((unsigned)(a - b + 2*QP) > 4*QP)
            return 0;
    }
    return 1;
}
265
isVertMinMaxOk_C(const uint8_t src[],int stride,int QP)266 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
267 {
268 int x;
269 src+= stride*4;
270 for(x=0; x<BLOCK_SIZE; x+=4){
271 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
272 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
273 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
274 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
275 }
276 return 1;
277 }
278
horizClassify_C(const uint8_t src[],int stride,const PPContext * c)279 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
280 {
281 if( isHorizDC_C(src, stride, c) ){
282 if( isHorizMinMaxOk_C(src, stride, c->QP) )
283 return 1;
284 else
285 return 0;
286 }else{
287 return 2;
288 }
289 }
290
vertClassify_C(const uint8_t src[],int stride,const PPContext * c)291 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
292 {
293 if( isVertDC_C(src, stride, c) ){
294 if( isVertMinMaxOk_C(src, stride, c->QP) )
295 return 1;
296 else
297 return 0;
298 }else{
299 return 2;
300 }
301 }
302
doHorizDefFilter_C(uint8_t dst[],int stride,const PPContext * c)303 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
304 {
305 int y;
306 for(y=0; y<BLOCK_SIZE; y++){
307 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
308
309 if(FFABS(middleEnergy) < 8*c->QP){
310 const int q=(dst[3] - dst[4])/2;
311 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
312 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
313
314 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
315 d= FFMAX(d, 0);
316
317 d= (5*d + 32) >> 6;
318 d*= FFSIGN(-middleEnergy);
319
320 if(q>0)
321 {
322 d = FFMAX(d, 0);
323 d = FFMIN(d, q);
324 }
325 else
326 {
327 d = FFMIN(d, 0);
328 d = FFMAX(d, q);
329 }
330
331 dst[3]-= d;
332 dst[4]+= d;
333 }
334 dst+= stride;
335 }
336 }
337
338 /**
339 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
340 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
341 */
doHorizLowPass_C(uint8_t dst[],int stride,const PPContext * c)342 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
343 {
344 int y;
345 for(y=0; y<BLOCK_SIZE; y++){
346 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
347 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
348
349 int sums[10];
350 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
351 sums[1] = sums[0] - first + dst[3];
352 sums[2] = sums[1] - first + dst[4];
353 sums[3] = sums[2] - first + dst[5];
354 sums[4] = sums[3] - first + dst[6];
355 sums[5] = sums[4] - dst[0] + dst[7];
356 sums[6] = sums[5] - dst[1] + last;
357 sums[7] = sums[6] - dst[2] + last;
358 sums[8] = sums[7] - dst[3] + last;
359 sums[9] = sums[8] - dst[4] + last;
360
361 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
362 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
363 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
364 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
365 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
366 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
367 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
368 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
369
370 dst+= stride;
371 }
372 }
373
374 /**
375 * Experimental Filter 1 (Horizontal)
376 * will not damage linear gradients
377 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
378 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
379 * MMX2 version does correct clipping C version does not
380 * not identical with the vertical one
381 */
horizX1Filter(uint8_t * src,int stride,int QP)382 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
383 {
384 int y;
385 static uint64_t lut[256];
386 if(!lut[255])
387 {
388 int i;
389 for(i=0; i<256; i++)
390 {
391 int v= i < 128 ? 2*i : 2*(i-256);
392 /*
393 //Simulate 112242211 9-Tap filter
394 uint64_t a= (v/16) & 0xFF;
395 uint64_t b= (v/8) & 0xFF;
396 uint64_t c= (v/4) & 0xFF;
397 uint64_t d= (3*v/8) & 0xFF;
398 */
399 //Simulate piecewise linear interpolation
400 uint64_t a= (v/16) & 0xFF;
401 uint64_t b= (v*3/16) & 0xFF;
402 uint64_t c= (v*5/16) & 0xFF;
403 uint64_t d= (7*v/16) & 0xFF;
404 uint64_t A= (0x100 - a)&0xFF;
405 uint64_t B= (0x100 - b)&0xFF;
406 uint64_t C= (0x100 - c)&0xFF;
407 uint64_t D= (0x100 - c)&0xFF;
408
409 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
410 (D<<24) | (C<<16) | (B<<8) | (A);
411 //lut[i] = (v<<32) | (v<<24);
412 }
413 }
414
415 for(y=0; y<BLOCK_SIZE; y++){
416 int a= src[1] - src[2];
417 int b= src[3] - src[4];
418 int c= src[5] - src[6];
419
420 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
421
422 if(d < QP){
423 int v = d * FFSIGN(-b);
424
425 src[1] +=v/8;
426 src[2] +=v/4;
427 src[3] +=3*v/8;
428 src[4] -=3*v/8;
429 src[5] -=v/4;
430 src[6] -=v/8;
431 }
432 src+=stride;
433 }
434 }
435
436 /**
437 * accurate deblock filter
438 */
do_a_deblock_C(uint8_t * src,int step,int stride,const PPContext * c)439 static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
440 int stride, const PPContext *c)
441 {
442 int y;
443 const int QP= c->QP;
444 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
445 const int dcThreshold= dcOffset*2 + 1;
446 //START_TIMER
447 src+= step*4; // src points to begin of the 8x8 Block
448 for(y=0; y<8; y++){
449 int numEq= 0;
450
451 if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
452 if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
453 if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
454 if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
455 if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
456 if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
457 if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
458 if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
459 if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
460 if(numEq > c->ppMode.flatnessThreshold){
461 int min, max, x;
462
463 if(src[0] > src[step]){
464 max= src[0];
465 min= src[step];
466 }else{
467 max= src[step];
468 min= src[0];
469 }
470 for(x=2; x<8; x+=2){
471 if(src[x*step] > src[(x+1)*step]){
472 if(src[x *step] > max) max= src[ x *step];
473 if(src[(x+1)*step] < min) min= src[(x+1)*step];
474 }else{
475 if(src[(x+1)*step] > max) max= src[(x+1)*step];
476 if(src[ x *step] < min) min= src[ x *step];
477 }
478 }
479 if(max-min < 2*QP){
480 const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
481 const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];
482
483 int sums[10];
484 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
485 sums[1] = sums[0] - first + src[3*step];
486 sums[2] = sums[1] - first + src[4*step];
487 sums[3] = sums[2] - first + src[5*step];
488 sums[4] = sums[3] - first + src[6*step];
489 sums[5] = sums[4] - src[0*step] + src[7*step];
490 sums[6] = sums[5] - src[1*step] + last;
491 sums[7] = sums[6] - src[2*step] + last;
492 sums[8] = sums[7] - src[3*step] + last;
493 sums[9] = sums[8] - src[4*step] + last;
494
495 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
496 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
497 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
498 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
499 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
500 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
501 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
502 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
503 }
504 }else{
505 const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);
506
507 if(FFABS(middleEnergy) < 8*QP){
508 const int q=(src[3*step] - src[4*step])/2;
509 const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
510 const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);
511
512 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
513 d= FFMAX(d, 0);
514
515 d= (5*d + 32) >> 6;
516 d*= FFSIGN(-middleEnergy);
517
518 if(q>0){
519 d = FFMAX(d, 0);
520 d = FFMIN(d, q);
521 }else{
522 d = FFMIN(d, 0);
523 d = FFMAX(d, q);
524 }
525
526 src[3*step]-= d;
527 src[4*step]+= d;
528 }
529 }
530
531 src += stride;
532 }
533 /*if(step==16){
534 STOP_TIMER("step16")
535 }else{
536 STOP_TIMER("stepX")
537 }*/
538 }
539
//Note: we have C, MMX, MMX2 and 3DNOW versions; there is no combined 3DNOW+MMX2 one
541 //Plain C versions
542 //we always compile C for testing which needs bitexactness
543 #define TEMPLATE_PP_C 1
544 #include "postprocess_template.c"
545
546 #if HAVE_ALTIVEC
547 # define TEMPLATE_PP_ALTIVEC 1
548 # include "postprocess_altivec_template.c"
549 # include "postprocess_template.c"
550 #endif
551
552 #if ARCH_X86 && HAVE_INLINE_ASM
553 # if CONFIG_RUNTIME_CPUDETECT
554 # define TEMPLATE_PP_MMX 1
555 # include "postprocess_template.c"
556 # define TEMPLATE_PP_MMXEXT 1
557 # include "postprocess_template.c"
558 # define TEMPLATE_PP_3DNOW 1
559 # include "postprocess_template.c"
560 # define TEMPLATE_PP_SSE2 1
561 # include "postprocess_template.c"
562 # else
563 # if HAVE_SSE2_INLINE
564 # define TEMPLATE_PP_SSE2 1
565 # include "postprocess_template.c"
566 # elif HAVE_MMXEXT_INLINE
567 # define TEMPLATE_PP_MMXEXT 1
568 # include "postprocess_template.c"
569 # elif HAVE_AMD3DNOW_INLINE
570 # define TEMPLATE_PP_3DNOW 1
571 # include "postprocess_template.c"
572 # elif HAVE_MMX_INLINE
573 # define TEMPLATE_PP_MMX 1
574 # include "postprocess_template.c"
575 # endif
576 # endif
577 #endif
578
579 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
580 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
581
postProcess(const uint8_t src[],int srcStride,uint8_t dst[],int dstStride,int width,int height,const QP_STORE_T QPs[],int QPStride,int isColor,pp_mode * vm,pp_context * vc)582 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
583 const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
584 {
585 pp_fn pp = postProcess_C;
586 PPContext *c= (PPContext *)vc;
587 PPMode *ppMode= (PPMode *)vm;
588 c->ppMode= *ppMode; //FIXME
589
590 if (!(ppMode->lumMode & BITEXACT)) {
591 #if CONFIG_RUNTIME_CPUDETECT
592 #if ARCH_X86 && HAVE_INLINE_ASM
593 // ordered per speed fastest first
594 if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
595 else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
596 else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
597 else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
598 #elif HAVE_ALTIVEC
599 if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
600 #endif
601 #else /* CONFIG_RUNTIME_CPUDETECT */
602 #if HAVE_SSE2_INLINE
603 pp = postProcess_SSE2;
604 #elif HAVE_MMXEXT_INLINE
605 pp = postProcess_MMX2;
606 #elif HAVE_AMD3DNOW_INLINE
607 pp = postProcess_3DNow;
608 #elif HAVE_MMX_INLINE
609 pp = postProcess_MMX;
610 #elif HAVE_ALTIVEC
611 pp = postProcess_altivec;
612 #endif
613 #endif /* !CONFIG_RUNTIME_CPUDETECT */
614 }
615
616 pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
617 }
618
/* -pp Command line Help
 *
 * Printed line by line by pp_get_mode_by_name_and_quality() when the mode
 * name is "help". The long filter names listed here must match the names
 * in filters[] exactly, because the parser compares them case-sensitively.
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
664
pp_get_mode_by_name_and_quality(const char * name,int quality)665 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
666 {
667 char temp[GET_MODE_BUFFER_SIZE];
668 char *p= temp;
669 static const char filterDelimiters[] = ",/";
670 static const char optionDelimiters[] = ":|";
671 struct PPMode *ppMode;
672 char *filterToken;
673
674 if (!name) {
675 av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
676 return NULL;
677 }
678
679 if (!strcmp(name, "help")) {
680 const char *p;
681 for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
682 av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
683 av_log(NULL, AV_LOG_INFO, "%s", temp);
684 }
685 return NULL;
686 }
687
688 ppMode= av_malloc(sizeof(PPMode));
689
690 ppMode->lumMode= 0;
691 ppMode->chromMode= 0;
692 ppMode->maxTmpNoise[0]= 700;
693 ppMode->maxTmpNoise[1]= 1500;
694 ppMode->maxTmpNoise[2]= 3000;
695 ppMode->maxAllowedY= 234;
696 ppMode->minAllowedY= 16;
697 ppMode->baseDcDiff= 256/8;
698 ppMode->flatnessThreshold= 56-16-1;
699 ppMode->maxClippedThreshold= 0.01;
700 ppMode->error=0;
701
702 memset(temp, 0, GET_MODE_BUFFER_SIZE);
703 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
704
705 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
706
707 for(;;){
708 char *filterName;
709 int q= 1000000; //PP_QUALITY_MAX;
710 int chrom=-1;
711 int luma=-1;
712 char *option;
713 char *options[OPTIONS_ARRAY_SIZE];
714 int i;
715 int filterNameOk=0;
716 int numOfUnknownOptions=0;
717 int enable=1; //does the user want us to enabled or disabled the filter
718
719 filterToken= strtok(p, filterDelimiters);
720 if(!filterToken) break;
721 p+= strlen(filterToken) + 1; // p points to next filterToken
722 filterName= strtok(filterToken, optionDelimiters);
723 if (!filterName) {
724 ppMode->error++;
725 break;
726 }
727 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
728
729 if(*filterName == '-'){
730 enable=0;
731 filterName++;
732 }
733
734 for(;;){ //for all options
735 option= strtok(NULL, optionDelimiters);
736 if(!option) break;
737
738 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
739 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
740 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
741 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
742 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
743 else{
744 options[numOfUnknownOptions] = option;
745 numOfUnknownOptions++;
746 }
747 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
748 }
749 options[numOfUnknownOptions] = NULL;
750
751 /* replace stuff from the replace Table */
752 for(i=0; replaceTable[2*i]; i++){
753 if(!strcmp(replaceTable[2*i], filterName)){
754 int newlen= strlen(replaceTable[2*i + 1]);
755 int plen;
756 int spaceLeft;
757
758 p--, *p=',';
759
760 plen= strlen(p);
761 spaceLeft= p - temp + plen;
762 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
763 ppMode->error++;
764 break;
765 }
766 memmove(p + newlen, p, plen+1);
767 memcpy(p, replaceTable[2*i + 1], newlen);
768 filterNameOk=1;
769 }
770 }
771
772 for(i=0; filters[i].shortName; i++){
773 if( !strcmp(filters[i].longName, filterName)
774 || !strcmp(filters[i].shortName, filterName)){
775 ppMode->lumMode &= ~filters[i].mask;
776 ppMode->chromMode &= ~filters[i].mask;
777
778 filterNameOk=1;
779 if(!enable) break; // user wants to disable it
780
781 if(q >= filters[i].minLumQuality && luma)
782 ppMode->lumMode|= filters[i].mask;
783 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
784 if(q >= filters[i].minChromQuality)
785 ppMode->chromMode|= filters[i].mask;
786
787 if(filters[i].mask == LEVEL_FIX){
788 int o;
789 ppMode->minAllowedY= 16;
790 ppMode->maxAllowedY= 234;
791 for(o=0; options[o]; o++){
792 if( !strcmp(options[o],"fullyrange")
793 ||!strcmp(options[o],"f")){
794 ppMode->minAllowedY= 0;
795 ppMode->maxAllowedY= 255;
796 numOfUnknownOptions--;
797 }
798 }
799 }
800 else if(filters[i].mask == TEMP_NOISE_FILTER)
801 {
802 int o;
803 int numOfNoises=0;
804
805 for(o=0; options[o]; o++){
806 char *tail;
807 ppMode->maxTmpNoise[numOfNoises]=
808 strtol(options[o], &tail, 0);
809 if(tail!=options[o]){
810 numOfNoises++;
811 numOfUnknownOptions--;
812 if(numOfNoises >= 3) break;
813 }
814 }
815 }
816 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
817 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
818 int o;
819
820 for(o=0; options[o] && o<2; o++){
821 char *tail;
822 int val= strtol(options[o], &tail, 0);
823 if(tail==options[o]) break;
824
825 numOfUnknownOptions--;
826 if(o==0) ppMode->baseDcDiff= val;
827 else ppMode->flatnessThreshold= val;
828 }
829 }
830 else if(filters[i].mask == FORCE_QUANT){
831 int o;
832 ppMode->forcedQuant= 15;
833
834 for(o=0; options[o] && o<1; o++){
835 char *tail;
836 int val= strtol(options[o], &tail, 0);
837 if(tail==options[o]) break;
838
839 numOfUnknownOptions--;
840 ppMode->forcedQuant= val;
841 }
842 }
843 }
844 }
845 if(!filterNameOk) ppMode->error++;
846 ppMode->error += numOfUnknownOptions;
847 }
848
849 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
850 if(ppMode->error){
851 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
852 av_free(ppMode);
853 return NULL;
854 }
855 return ppMode;
856 }
857
/**
 * Release a mode; the pointer is handed straight to av_free().
 */
void pp_free_mode(pp_mode *mode)
{
    av_free(mode);
}
861
/**
 * Free the buffer *p points to and replace it with a new zero-filled
 * buffer of `size` bytes obtained from av_mallocz().
 *
 * The old contents are not preserved. The `alignment` argument is not used.
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
866
/**
 * (Re)allocate all work buffers of the context for the given geometry and
 * record the new stride and qpStride in the context.
 *
 * Every buffer is freed and replaced by a zero-filled one, so previous
 * contents are lost. The luma histogram is then seeded with a uniform
 * value derived from the picture size.
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride)
{
    const int mbWidth  = (width  + 15) >> 4;
    const int mbHeight = (height + 15) >> 4;
    int n;

    c->stride   = stride;
    c->qpStride = qpStride;

    reallocAlign((void **)&c->tempDst,    8, stride*24+32);
    reallocAlign((void **)&c->tempSrc,    8, stride*24);
    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
    for (n = 0; n < 256; n++)
        c->yHistogram.t_uint64_t[n] = width*height/64*15/256;

    for (n = 0; n < 3; n++) {
        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
        reallocAlign((void **)&c->tempBlurred[n],     8, stride*mbHeight*16 + 17*1024);
        reallocAlign((void **)&c->tempBlurredPast[n], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
    }

    reallocAlign((void **)&c->deintTemp,     8, 2*width+32);
    reallocAlign((void **)&c->nonBQPTable,   8, qpStride*mbHeight*sizeof(QP_STORE_T));
    reallocAlign((void **)&c->stdQPTable,    8, qpStride*mbHeight*sizeof(QP_STORE_T));
    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
}
893
/**
 * Name callback stored in av_codec_context_class; returns the fixed string
 * "postproc" regardless of the pointer passed in.
 */
static const char *context_to_name(void *ptr)
{
    static const char name[] = "postproc";

    return name;
}
897
/* AVClass installed as av_class in every PPContext by pp_get_context(). */
static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
899
pp_get_context(int width,int height,int cpuCaps)900 pp_context *pp_get_context(int width, int height, int cpuCaps){
901 PPContext *c= av_malloc(sizeof(PPContext));
902 int stride= FFALIGN(width, 16); //assumed / will realloc if needed
903 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
904
905 memset(c, 0, sizeof(PPContext));
906 c->av_class = &av_codec_context_class;
907 if(cpuCaps&PP_FORMAT){
908 c->hChromaSubSample= cpuCaps&0x3;
909 c->vChromaSubSample= (cpuCaps>>4)&0x3;
910 }else{
911 c->hChromaSubSample= 1;
912 c->vChromaSubSample= 1;
913 }
914 if (cpuCaps & PP_CPU_CAPS_AUTO) {
915 c->cpuCaps = av_get_cpu_flags();
916 } else {
917 c->cpuCaps = 0;
918 if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
919 if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
920 if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
921 if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
922 }
923
924 reallocBuffers(c, width, height, stride, qpStride);
925
926 c->frameNum=-1;
927
928 return c;
929 }
930
pp_free_context(void * vc)931 void pp_free_context(void *vc){
932 PPContext *c = (PPContext*)vc;
933 int i;
934
935 for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
936 for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
937
938 av_free(c->tempBlocks.t_uint8_t);
939 av_free(c->yHistogram.t_uint64_t);
940 av_free(c->tempDst.t_uint8_t);
941 av_free(c->tempSrc.t_uint8_t);
942 av_free(c->deintTemp.t_uint8_t);
943 av_free(c->stdQPTable.t_QPST);
944 av_free(c->nonBQPTable.t_QPST);
945 av_free(c->forcedQPTable.t_QPST);
946
947 memset(c, 0, sizeof(PPContext));
948
949 av_free(c);
950 }
951
/**
 * Postprocess one picture.
 *
 * Steps, in order:
 *  1. grow the context buffers if the plane or QP strides exceed the ones
 *     the context was allocated for;
 *  2. if no QP table is given or FORCE_QUANT is set in the luma mode, use a
 *     single row of constant QPs (the forced quantiser, or 1) with stride 0;
 *  3. if PP_PICT_TYPE_QP2 is set in pict_type, halve all QP values into the
 *     context's stdQPTable and use that table instead;
 *  4. unless (pict_type & 7) == 3, store the QP values masked with 0x3F in
 *     the context's nonBQPTable
 *     (NOTE(review): picture type 3 presumably denotes B-frames -- confirm);
 *  5. filter plane 0, then either filter planes 1 and 2 (if a chroma mode
 *     is set) or copy them unchanged.
 *
 * @param src, srcStride  source planes and their strides
 * @param dst, dstStride  destination planes and their strides
 * @param width, height   size of plane 0; planes 1 and 2 use the size
 *                        shifted right by the context's subsampling values
 * @param QP_store        QP table, may be NULL
 * @param QPStride        stride of the QP table, may be negative
 * @param vm              mode from pp_get_mode_by_name_and_quality()
 * @param vc              context from pp_get_context()
 * @param pict_type       picture type, optionally ORed with PP_PICT_TYPE_QP2
 */
void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
                    uint8_t * dst[3], const int dstStride[3],
                    int width, int height,
                    const QP_STORE_T *QP_store, int QPStride,
                    pp_mode *vm, void *vc, int pict_type)
{
    PPContext *c    = (PPContext*)vc;
    PPMode    *mode = (PPMode*)vm;
    const int mbWidth   = (width +15)>>4;
    const int mbHeight  = (height+15)>>4;
    const int minStride = FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int absQPStride     = FFABS(QPStride);
    int i;

    // c->stride and c->QPStride are always positive
    if (c->stride < minStride || c->qpStride < absQPStride) {
        reallocBuffers(c, width, height,
                       FFMAX(minStride, c->stride),
                       FFMAX(c->qpStride, absQPStride));
    }

    if (!QP_store || (mode->lumMode & FORCE_QUANT)) {
        const int forced = mode->lumMode & FORCE_QUANT;

        QP_store    = c->forcedQPTable.t_QPST;
        QPStride    = 0;
        absQPStride = 0;
        for (i = 0; i < mbWidth; i++)
            c->forcedQPTable.t_QPST[i] = forced ? mode->forcedQuant : 1;
    }

    if (pict_type & PP_PICT_TYPE_QP2) {
        const int count = mbHeight * absQPStride;
        const int words = count >> 2;

        /* four QP values at a time, then the remaining ones singly */
        for (i = 0; i < words; i++)
            ((uint32_t*)c->stdQPTable.t_QPST)[i] = (((const uint32_t*)QP_store)[i]>>1) & 0x7F7F7F7F;
        for (i = words << 2; i < count; i++)
            c->stdQPTable.t_QPST[i] = QP_store[i]>>1;

        QP_store = c->stdQPTable.t_QPST;
        QPStride = absQPStride;
    }

    if (0) { /* disabled debug dump of the QP table */
        int x, y;
        for (y = 0; y < mbHeight; y++) {
            for (x = 0; x < mbWidth; x++)
                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
            av_log(c, AV_LOG_INFO, "\n");
        }
        av_log(c, AV_LOG_INFO, "\n");
    }

    if ((pict_type&7) != 3) {
        if (QPStride >= 0) {
            const int count = mbHeight * QPStride;
            const int words = count >> 2;

            for (i = 0; i < words; i++)
                ((uint32_t*)c->nonBQPTable.t_QPST)[i] = ((const uint32_t*)QP_store)[i] & 0x3F3F3F3F;
            for (i = words << 2; i < count; i++)
                c->nonBQPTable.t_QPST[i] = QP_store[i] & 0x3F;
        } else {
            /* negative stride: copy row by row into a table with positive stride */
            int row, col;
            for (row = 0; row < mbHeight; row++) {
                for (col = 0; col < absQPStride; col++)
                    c->nonBQPTable.t_QPST[row*absQPStride+col] = QP_store[row*QPStride+col] & 0x3F;
            }
        }
    }

    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
           mode->lumMode, mode->chromMode);

    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, mode, c);

    width  = (width )>>c->hChromaSubSample;
    height = (height)>>c->vChromaSubSample;

    if (mode->chromMode) {
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, mode, c);
    } else if (srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]) {
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    } else {
        int y;
        for (y = 0; y < height; y++) {
            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
        }
    }
}
1051