1 /*
2 * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3 *
4 * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 /**
24 * @file
25 * postprocessing.
26 */
27
28 /*
29 C MMX MMX2 3DNow AltiVec
30 isVertDC Ec Ec Ec
31 isVertMinMaxOk Ec Ec Ec
32 doVertLowPass E e e Ec
33 doVertDefFilter Ec Ec e e Ec
34 isHorizDC Ec Ec Ec
35 isHorizMinMaxOk a E Ec
36 doHorizLowPass E e e Ec
37 doHorizDefFilter Ec Ec e e Ec
38 do_a_deblock Ec E Ec E
39 deRing E e e* Ecp
40 Vertical RKAlgo1 E a a
41 Horizontal RKAlgo1 a a
42 Vertical X1# a E E
43 Horizontal X1# a E E
44 LinIpolDeinterlace e E E*
45 CubicIpolDeinterlace a e e*
46 LinBlendDeinterlace e E E*
47 MedianDeinterlace# E Ec Ec
48 TempDeNoiser# E e e Ec
49
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once)
(the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73
74 //Changelog: use git log
75
76 #include "config.h"
77 #include "libavutil/avutil.h"
78 #include "libavutil/avassert.h"
79 #include "libavutil/intreadwrite.h"
80 #include <inttypes.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 //#undef HAVE_MMXEXT_INLINE
85 //#define HAVE_AMD3DNOW_INLINE
86 //#undef HAVE_MMX_INLINE
87 //#undef ARCH_X86
88 //#define DEBUG_BRIGHTNESS
89 #include "postprocess.h"
90 #include "postprocess_internal.h"
91 #include "libavutil/avstring.h"
92 #include "libavutil/ppc/util_altivec.h"
93
94 #include "libavutil/ffversion.h"
95 const char postproc_ffversion[] = "FFmpeg version " FFMPEG_VERSION;
96
postproc_version(void)97 unsigned postproc_version(void)
98 {
99 av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
100 return LIBPOSTPROC_VERSION_INT;
101 }
102
postproc_configuration(void)103 const char *postproc_configuration(void)
104 {
105 return FFMPEG_CONFIGURATION;
106 }
107
postproc_license(void)108 const char *postproc_license(void)
109 {
110 #define LICENSE_PREFIX "libpostproc license: "
111 return &LICENSE_PREFIX FFMPEG_LICENSE[sizeof(LICENSE_PREFIX) - 1];
112 }
113
/* Size of the scratch buffer into which pp_get_mode_by_name_and_quality()
 * copies the mode string before tokenizing it. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array collecting the options that are not one of
 * the generic flags (autoq, chrom, nochrom, noluma). */
#define OPTIONS_ARRAY_SIZE 10
/* Number of rows / columns the C filters in this file iterate over per block. */
#define BLOCK_SIZE 8
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet

#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit packed constants: wNN repeats the 16-bit value 0x00NN four times,
 * bNN repeats the byte 0xNN eight times.
 * NOTE(review): presumably consumed by the inline-assembly templates included
 * further down -- they are not referenced by the C code in this file. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif

/* NOTE(review): presumably the threshold of the deringing filter implemented
 * in the included templates -- confirm there. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
132
133
/*
 * Filters that can be requested by name in a mode string.
 *
 * pp_get_mode_by_name_and_quality() matches a filter name against the first
 * two (string) members, the short and the long name, and uses the members
 * chromDefault, minLumQuality, minChromQuality and mask of the matching entry.
 * NOTE(review): the three integers are presumably chromDefault, minLumQuality
 * and minChromQuality in that order, followed by the mask -- confirm against
 * the declaration of struct PPFilter.
 * The table ends with an entry whose short name is NULL.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock", 1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock", 1, 2, 4, V_DEBLOCK},
/* {"hr", "rkhdeblock", 1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock", 1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock", 1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock", 1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock", 1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock", 1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering", 1, 5, 6, DERING},
    {"al", "autolevels", 0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint", 1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint", 1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint", 1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint", 1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint", 1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5", 1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise", 1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant", 1, 0, 0, FORCE_QUANT},
    {"be", "bitexact", 1, 0, 0, BITEXACT},
    {"vi", "visualize", 1, 0, 0, VISUALIZE},
    {NULL, NULL,0,0,0,0} //End Marker
};
158
/*
 * Aliases understood by pp_get_mode_by_name_and_quality().
 *
 * The entries come in pairs: element 2*i is the alias name and element 2*i+1
 * is the filter string that is spliced into the mode string in its place.
 * A NULL alias name terminates the table.
 */
static const char * const replaceTable[]=
{
    "default", "hb:a,vb:a,dr:a",
    "de", "hb:a,vb:a,dr:a",
    "fast", "h1:a,v1:a,dr:a",
    "fa", "h1:a,v1:a,dr:a",
    "ac", "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
168
169 /* The horizontal functions exist only in C because the MMX
170 * code is faster with vertical filters and transposing. */
171
172 /**
173 * Check if the given 8x8 Block is mostly "flat"
174 */
isHorizDC_C(const uint8_t src[],int stride,const PPContext * c)175 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
176 {
177 int numEq= 0;
178 int y;
179 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
180 const int dcThreshold= dcOffset*2 + 1;
181
182 for(y=0; y<BLOCK_SIZE; y++){
183 numEq += ((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold;
184 numEq += ((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold;
185 numEq += ((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold;
186 numEq += ((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold;
187 numEq += ((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold;
188 numEq += ((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold;
189 numEq += ((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold;
190 src+= stride;
191 }
192 return numEq > c->ppMode.flatnessThreshold;
193 }
194
195 /**
196 * Check if the middle 8x8 Block in the given 8x16 block is flat
197 */
isVertDC_C(const uint8_t src[],int stride,const PPContext * c)198 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
199 {
200 int numEq= 0;
201 int y;
202 const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
203 const int dcThreshold= dcOffset*2 + 1;
204
205 src+= stride*4; // src points to begin of the 8x8 Block
206 for(y=0; y<BLOCK_SIZE-1; y++){
207 numEq += ((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold;
208 numEq += ((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold;
209 numEq += ((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold;
210 numEq += ((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold;
211 numEq += ((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold;
212 numEq += ((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold;
213 numEq += ((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold;
214 numEq += ((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold;
215 src+= stride;
216 }
217 return numEq > c->ppMode.flatnessThreshold;
218 }
219
/**
 * Sparse range check over 8 rows: one pair of pixels is sampled per row and
 * the check fails as soon as (unsigned)(first - second + 2*QP) exceeds 4*QP.
 *
 * The sampled column pairs are (0,5), (2,7), (4,1), (6,3) on rows 0..3 and the
 * same pattern again on rows 4..7.
 *
 * @param src    first pixel of the block
 * @param stride offset from one row to the next
 * @param QP     quantizer the limits are derived from
 * @return 1 if every sampled pair passes, 0 otherwise
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    static const uint8_t firstCol[4]  = { 0, 2, 4, 6 };
    static const uint8_t secondCol[4] = { 5, 7, 1, 3 };
    const uint8_t *row = src;
    int line;

    for (line = 0; line < 8; line++) {
        const int k = line & 3;

        if ((unsigned)(row[firstCol[k]] - row[secondCol[k]] + 2*QP) > 4*QP)
            return 0;
        row += stride;
    }

    return 1;
}
235
isVertMinMaxOk_C(const uint8_t src[],int stride,int QP)236 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
237 {
238 int x;
239 src+= stride*4;
240 for(x=0; x<BLOCK_SIZE; x+=4){
241 if((unsigned)(src[ x + 0*stride] - src[ x + 5*stride] + 2*QP) > 4*QP) return 0;
242 if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
243 if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
244 if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
245 }
246 return 1;
247 }
248
horizClassify_C(const uint8_t src[],int stride,const PPContext * c)249 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
250 {
251 if( isHorizDC_C(src, stride, c) ){
252 return isHorizMinMaxOk_C(src, stride, c->QP);
253 }else{
254 return 2;
255 }
256 }
257
vertClassify_C(const uint8_t src[],int stride,const PPContext * c)258 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
259 {
260 if( isVertDC_C(src, stride, c) ){
261 return isVertMinMaxOk_C(src, stride, c->QP);
262 }else{
263 return 2;
264 }
265 }
266
doHorizDefFilter_C(uint8_t dst[],int stride,const PPContext * c)267 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
268 {
269 int y;
270 for(y=0; y<BLOCK_SIZE; y++){
271 const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
272
273 if(FFABS(middleEnergy) < 8*c->QP){
274 const int q=(dst[3] - dst[4])/2;
275 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
276 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
277
278 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
279 d= FFMAX(d, 0);
280
281 d= (5*d + 32) >> 6;
282 d*= FFSIGN(-middleEnergy);
283
284 if(q>0)
285 {
286 d = FFMAX(d, 0);
287 d = FFMIN(d, q);
288 }
289 else
290 {
291 d = FFMIN(d, 0);
292 d = FFMAX(d, q);
293 }
294
295 dst[3]-= d;
296 dst[4]+= d;
297 }
298 dst+= stride;
299 }
300 }
301
302 /**
303 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
304 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
305 */
doHorizLowPass_C(uint8_t dst[],int stride,const PPContext * c)306 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
307 {
308 int y;
309 for(y=0; y<BLOCK_SIZE; y++){
310 const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
311 const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
312
313 int sums[10];
314 sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
315 sums[1] = sums[0] - first + dst[3];
316 sums[2] = sums[1] - first + dst[4];
317 sums[3] = sums[2] - first + dst[5];
318 sums[4] = sums[3] - first + dst[6];
319 sums[5] = sums[4] - dst[0] + dst[7];
320 sums[6] = sums[5] - dst[1] + last;
321 sums[7] = sums[6] - dst[2] + last;
322 sums[8] = sums[7] - dst[3] + last;
323 sums[9] = sums[8] - dst[4] + last;
324
325 dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
326 dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
327 dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
328 dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
329 dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
330 dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
331 dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
332 dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
333
334 dst+= stride;
335 }
336 }
337
338 /**
339 * Experimental Filter 1 (Horizontal)
340 * will not damage linear gradients
341 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
342 * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
343 * MMX2 version does correct clipping C version does not
344 * not identical with the vertical one
345 */
horizX1Filter(uint8_t * src,int stride,int QP)346 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
347 {
348 int y;
349 static uint64_t lut[256];
350 if(!lut[255])
351 {
352 int i;
353 for(i=0; i<256; i++)
354 {
355 int v= i < 128 ? 2*i : 2*(i-256);
356 /*
357 //Simulate 112242211 9-Tap filter
358 uint64_t a= (v/16) & 0xFF;
359 uint64_t b= (v/8) & 0xFF;
360 uint64_t c= (v/4) & 0xFF;
361 uint64_t d= (3*v/8) & 0xFF;
362 */
363 //Simulate piecewise linear interpolation
364 uint64_t a= (v/16) & 0xFF;
365 uint64_t b= (v*3/16) & 0xFF;
366 uint64_t c= (v*5/16) & 0xFF;
367 uint64_t d= (7*v/16) & 0xFF;
368 uint64_t A= (0x100 - a)&0xFF;
369 uint64_t B= (0x100 - b)&0xFF;
370 uint64_t C= (0x100 - c)&0xFF;
371 uint64_t D= (0x100 - c)&0xFF;
372
373 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
374 (D<<24) | (C<<16) | (B<<8) | (A);
375 //lut[i] = (v<<32) | (v<<24);
376 }
377 }
378
379 for(y=0; y<BLOCK_SIZE; y++){
380 int a= src[1] - src[2];
381 int b= src[3] - src[4];
382 int c= src[5] - src[6];
383
384 int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
385
386 if(d < QP){
387 int v = d * FFSIGN(-b);
388
389 src[1] +=v/8;
390 src[2] +=v/4;
391 src[3] +=3*v/8;
392 src[4] -=3*v/8;
393 src[5] -=v/4;
394 src[6] -=v/8;
395 }
396 src+=stride;
397 }
398 }
399
/**
 * accurate deblock filter
 *
 * Processes 8 lines. Within a line the samples are addressed as src[k*step];
 * after each line src advances by stride. For every line one of two filters
 * is selected:
 *  - if more than ppMode.flatnessThreshold of the 9 neighbouring sample pairs
 *    (k = -1..7 against k+1) differ by at most dcOffset, and the spread of
 *    the paired min/max scan below is smaller than 2*QP, samples 0..7 are
 *    low-pass filtered;
 *  - otherwise the default filter is applied, which only touches samples 3
 *    and 4.
 *
 * @param src    sample 4 steps before the first sample of the 8x8 block
 * @param step   offset between neighbouring samples of one line
 * @param stride offset from one line to the next
 * @param c      context; QP, nonBQP, ppMode.baseDcDiff and
 *               ppMode.flatnessThreshold are read
 * @param mode   filter mode flags; only VISUALIZE is tested here
 */
static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
                                            int stride, const PPContext *c, int mode)
{
    int y;
    const int QP= c->QP;
    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    const int dcThreshold= dcOffset*2 + 1;
//START_TIMER
    src+= step*4; // src points to begin of the 8x8 Block
    for(y=0; y<8; y++){
        int numEq= 0;

        // count the neighbouring pairs whose difference is within +-dcOffset
        numEq += ((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold;
        numEq += ((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold;
        if(numEq > c->ppMode.flatnessThreshold){
            int min, max, x;

            // min/max over samples 0..7, taken pairwise: of each pair the
            // larger sample is only tested against max, the smaller against min
            if(src[0] > src[step]){
                max= src[0];
                min= src[step];
            }else{
                max= src[step];
                min= src[0];
            }
            for(x=2; x<8; x+=2){
                if(src[x*step] > src[(x+1)*step]){
                    if(src[x *step] > max) max= src[ x *step];
                    if(src[(x+1)*step] < min) min= src[(x+1)*step];
                }else{
                    if(src[(x+1)*step] > max) max= src[(x+1)*step];
                    if(src[ x *step] < min) min= src[ x *step];
                }
            }
            if(max-min < 2*QP){
                // edge values: the sample outside the block is used when it
                // is closer than QP to the adjacent sample inside the block
                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];

                // running sums, each derived from the previous one
                int sums[10];
                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
                sums[1] = sums[0] - first + src[3*step];
                sums[2] = sums[1] - first + src[4*step];
                sums[3] = sums[2] - first + src[5*step];
                sums[4] = sums[3] - first + src[6*step];
                sums[5] = sums[4] - src[0*step] + src[7*step];
                sums[6] = sums[5] - src[1*step] + last;
                sums[7] = sums[6] - src[2*step] + last;
                sums[8] = sums[7] - src[3*step] + last;
                sums[9] = sums[8] - src[4*step] + last;

                // The stores further down still run in VISUALIZE mode, so the
                // 128 written here only replaces the 2*src[k*step] term of
                // each output; sums[] was computed from the unmodified samples.
                if (mode & VISUALIZE) {
                    src[0*step] =
                    src[1*step] =
                    src[2*step] =
                    src[3*step] =
                    src[4*step] =
                    src[5*step] =
                    src[6*step] =
                    src[7*step] = 128;
                }
                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
            }
        }else{
            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);

            if(FFABS(middleEnergy) < 8*QP){
                const int q=(src[3*step] - src[4*step])/2;
                const int leftEnergy= 5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);

                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);

                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);

                // limit d to the range between 0 and q
                if(q>0){
                    d = FFMAX(d, 0);
                    d = FFMIN(d, q);
                }else{
                    d = FFMIN(d, 0);
                    d = FFMAX(d, q);
                }

                // VISUALIZE: apply a fixed, clipped +-32 change instead of
                // the computed correction; d is zeroed so that the regular
                // update below becomes a no-op
                if ((mode & VISUALIZE) && d) {
                    d= (d < 0) ? 32 : -32;
                    src[3*step]= av_clip_uint8(src[3*step] - d);
                    src[4*step]= av_clip_uint8(src[4*step] + d);
                    d = 0;
                }

                src[3*step]-= d;
                src[4*step]+= d;
            }
        }

        src += stride;
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}*/
}
520
/*
 * postprocess_template.c is included once per implementation, each time with
 * a different TEMPLATE_PP_* macro defined.
 * NOTE(review): the template is presumably what defines the postProcess_C /
 * postProcess_MMX / postProcess_MMX2 / postProcess_3DNow / postProcess_SSE2 /
 * postProcess_altivec functions selected in postProcess(), and presumably
 * undefines the TEMPLATE_PP_* macro it was included with -- confirm in the
 * template itself.
 */
//Note: we have C, MMX, MMX2, 3DNOW version there is no 3DNOW+MMX2 one
//Plain C versions
//we always compile C for testing which needs bitexactness
#define TEMPLATE_PP_C 1
#include "postprocess_template.c"

#if HAVE_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 1
#   include "postprocess_altivec_template.c"
#   include "postprocess_template.c"
#endif

#if ARCH_X86 && HAVE_INLINE_ASM
#    if CONFIG_RUNTIME_CPUDETECT
/* run-time CPU detection: build every x86 variant */
#        define TEMPLATE_PP_MMX 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_MMXEXT 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_3DNOW 1
#        include "postprocess_template.c"
#        define TEMPLATE_PP_SSE2 1
#        include "postprocess_template.c"
#    else
/* no run-time detection: build only the first variant enabled at build time */
#        if HAVE_SSE2_INLINE
#            define TEMPLATE_PP_SSE2 1
#            include "postprocess_template.c"
#        elif HAVE_MMXEXT_INLINE
#            define TEMPLATE_PP_MMXEXT 1
#            include "postprocess_template.c"
#        elif HAVE_AMD3DNOW_INLINE
#            define TEMPLATE_PP_3DNOW 1
#            include "postprocess_template.c"
#        elif HAVE_MMX_INLINE
#            define TEMPLATE_PP_MMX 1
#            include "postprocess_template.c"
#        endif
#    endif
#endif
559
560 typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
561 const int8_t QPs[], int QPStride, int isColor, PPContext *c2);
562
postProcess(const uint8_t src[],int srcStride,uint8_t dst[],int dstStride,int width,int height,const int8_t QPs[],int QPStride,int isColor,pp_mode * vm,pp_context * vc)563 static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
564 const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
565 {
566 pp_fn pp = postProcess_C;
567 PPContext *c= (PPContext *)vc;
568 PPMode *ppMode= (PPMode *)vm;
569 c->ppMode= *ppMode; //FIXME
570
571 if (!(ppMode->lumMode & BITEXACT)) {
572 #if CONFIG_RUNTIME_CPUDETECT
573 #if ARCH_X86 && HAVE_INLINE_ASM
574 // ordered per speed fastest first
575 if (c->cpuCaps & AV_CPU_FLAG_SSE2) pp = postProcess_SSE2;
576 else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT) pp = postProcess_MMX2;
577 else if (c->cpuCaps & AV_CPU_FLAG_3DNOW) pp = postProcess_3DNow;
578 else if (c->cpuCaps & AV_CPU_FLAG_MMX) pp = postProcess_MMX;
579 #elif HAVE_ALTIVEC
580 if (c->cpuCaps & AV_CPU_FLAG_ALTIVEC) pp = postProcess_altivec;
581 #endif
582 #else /* CONFIG_RUNTIME_CPUDETECT */
583 #if HAVE_SSE2_INLINE
584 pp = postProcess_SSE2;
585 #elif HAVE_MMXEXT_INLINE
586 pp = postProcess_MMX2;
587 #elif HAVE_AMD3DNOW_INLINE
588 pp = postProcess_3DNow;
589 #elif HAVE_MMX_INLINE
590 pp = postProcess_MMX;
591 #elif HAVE_ALTIVEC
592 pp = postProcess_altivec;
593 #endif
594 #endif /* !CONFIG_RUNTIME_CPUDETECT */
595 }
596
597 pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
598 }
599
/* -pp Command line Help
 *
 * Printed line by line by pp_get_mode_by_name_and_quality() when the mode
 * name is "help".
 *
 * The long names listed here must be spelled exactly as in the filters[]
 * table, because filter names are matched with a case-sensitive strcmp():
 * the accurate deblocking filters are "ahdeblock" / "avdeblock" and the
 * quantizer override is "forcequant".
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters Options\n"
"short long name short long option Description\n"
"* * a autoq CPU power dependent enabler\n"
" c chrom chrominance filtering enabled\n"
" y nochrom chrominance filtering disabled\n"
" n noluma luma filtering disabled\n"
"hb hdeblock (2 threshold) horizontal deblocking filter\n"
" 1. difference factor: default=32, higher -> more deblocking\n"
" 2. flatness threshold: default=39, lower -> more deblocking\n"
" the h & v deblocking filters share these\n"
" so you can't set different thresholds for h / v\n"
"vb vdeblock (2 threshold) vertical deblocking filter\n"
"ha ahdeblock (2 threshold) horizontal deblocking filter\n"
"va avdeblock (2 threshold) vertical deblocking filter\n"
"h1 x1hdeblock experimental h deblock filter 1\n"
"v1 x1vdeblock experimental v deblock filter 1\n"
"dr dering deringing filter\n"
"al autolevels automatic brightness / contrast\n"
" f fullyrange stretch luminance to (0..255)\n"
"lb linblenddeint linear blend deinterlacer\n"
"li linipoldeint linear interpolating deinterlace\n"
"ci cubicipoldeint cubic interpolating deinterlacer\n"
"md mediandeint median deinterlacer\n"
"fd ffmpegdeint ffmpeg deinterlacer\n"
"l5 lowpass5 FIR lowpass deinterlacer\n"
"de default hb:a,vb:a,dr:a\n"
"fa fast h1:a,v1:a,dr:a\n"
"ac ha:a:128:7,va:a,dr:a\n"
"tn tmpnoise (3 threshold) temporal noise reducer\n"
" 1. <= 2. <= 3. larger -> stronger filtering\n"
"fq forcequant <quantizer> force quantizer\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
645
pp_get_mode_by_name_and_quality(const char * name,int quality)646 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
647 {
648 char temp[GET_MODE_BUFFER_SIZE];
649 char *p= temp;
650 static const char filterDelimiters[] = ",/";
651 static const char optionDelimiters[] = ":|";
652 struct PPMode *ppMode;
653 char *filterToken;
654
655 if (!name) {
656 av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
657 return NULL;
658 }
659
660 if (!strcmp(name, "help")) {
661 const char *p;
662 for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
663 av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
664 av_log(NULL, AV_LOG_INFO, "%s", temp);
665 }
666 return NULL;
667 }
668
669 ppMode= av_malloc(sizeof(PPMode));
670 if (!ppMode)
671 return NULL;
672
673 ppMode->lumMode= 0;
674 ppMode->chromMode= 0;
675 ppMode->maxTmpNoise[0]= 700;
676 ppMode->maxTmpNoise[1]= 1500;
677 ppMode->maxTmpNoise[2]= 3000;
678 ppMode->maxAllowedY= 234;
679 ppMode->minAllowedY= 16;
680 ppMode->baseDcDiff= 256/8;
681 ppMode->flatnessThreshold= 56-16-1;
682 ppMode->maxClippedThreshold= (AVRational){1,100};
683 ppMode->error=0;
684
685 memset(temp, 0, GET_MODE_BUFFER_SIZE);
686 av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
687
688 av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
689
690 for(;;){
691 const char *filterName;
692 int q= 1000000; //PP_QUALITY_MAX;
693 int chrom=-1;
694 int luma=-1;
695 const char *option;
696 const char *options[OPTIONS_ARRAY_SIZE];
697 int i;
698 int filterNameOk=0;
699 int numOfUnknownOptions=0;
700 int enable=1; //does the user want us to enabled or disabled the filter
701 char *tokstate;
702
703 filterToken= av_strtok(p, filterDelimiters, &tokstate);
704 if(!filterToken) break;
705 p+= strlen(filterToken) + 1; // p points to next filterToken
706 filterName= av_strtok(filterToken, optionDelimiters, &tokstate);
707 if (!filterName) {
708 ppMode->error++;
709 break;
710 }
711 av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
712
713 if(*filterName == '-'){
714 enable=0;
715 filterName++;
716 }
717
718 for(;;){ //for all options
719 option= av_strtok(NULL, optionDelimiters, &tokstate);
720 if(!option) break;
721
722 av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
723 if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
724 else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
725 else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
726 else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
727 else{
728 options[numOfUnknownOptions] = option;
729 numOfUnknownOptions++;
730 }
731 if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
732 }
733 options[numOfUnknownOptions] = NULL;
734
735 /* replace stuff from the replace Table */
736 for(i=0; replaceTable[2*i]; i++){
737 if(!strcmp(replaceTable[2*i], filterName)){
738 size_t newlen = strlen(replaceTable[2*i + 1]);
739 int plen;
740 int spaceLeft;
741
742 p--, *p=',';
743
744 plen= strlen(p);
745 spaceLeft= p - temp + plen;
746 if(spaceLeft + newlen >= GET_MODE_BUFFER_SIZE - 1){
747 ppMode->error++;
748 break;
749 }
750 memmove(p + newlen, p, plen+1);
751 memcpy(p, replaceTable[2*i + 1], newlen);
752 filterNameOk=1;
753 }
754 }
755
756 for(i=0; filters[i].shortName; i++){
757 if( !strcmp(filters[i].longName, filterName)
758 || !strcmp(filters[i].shortName, filterName)){
759 ppMode->lumMode &= ~filters[i].mask;
760 ppMode->chromMode &= ~filters[i].mask;
761
762 filterNameOk=1;
763 if(!enable) break; // user wants to disable it
764
765 if(q >= filters[i].minLumQuality && luma)
766 ppMode->lumMode|= filters[i].mask;
767 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
768 if(q >= filters[i].minChromQuality)
769 ppMode->chromMode|= filters[i].mask;
770
771 if(filters[i].mask == LEVEL_FIX){
772 int o;
773 ppMode->minAllowedY= 16;
774 ppMode->maxAllowedY= 234;
775 for(o=0; options[o]; o++){
776 if( !strcmp(options[o],"fullyrange")
777 ||!strcmp(options[o],"f")){
778 ppMode->minAllowedY= 0;
779 ppMode->maxAllowedY= 255;
780 numOfUnknownOptions--;
781 }
782 }
783 }
784 else if(filters[i].mask == TEMP_NOISE_FILTER)
785 {
786 int o;
787 int numOfNoises=0;
788
789 for(o=0; options[o]; o++){
790 char *tail;
791 ppMode->maxTmpNoise[numOfNoises]=
792 strtol(options[o], &tail, 0);
793 if(tail!=options[o]){
794 numOfNoises++;
795 numOfUnknownOptions--;
796 if(numOfNoises >= 3) break;
797 }
798 }
799 }
800 else if(filters[i].mask == V_DEBLOCK || filters[i].mask == H_DEBLOCK
801 || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
802 int o;
803
804 for(o=0; options[o] && o<2; o++){
805 char *tail;
806 int val= strtol(options[o], &tail, 0);
807 if(tail==options[o]) break;
808
809 numOfUnknownOptions--;
810 if(o==0) ppMode->baseDcDiff= val;
811 else ppMode->flatnessThreshold= val;
812 }
813 }
814 else if(filters[i].mask == FORCE_QUANT){
815 int o;
816 ppMode->forcedQuant= 15;
817
818 for(o=0; options[o] && o<1; o++){
819 char *tail;
820 int val= strtol(options[o], &tail, 0);
821 if(tail==options[o]) break;
822
823 numOfUnknownOptions--;
824 ppMode->forcedQuant= val;
825 }
826 }
827 }
828 }
829 if(!filterNameOk) ppMode->error++;
830 ppMode->error += numOfUnknownOptions;
831 }
832
833 av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
834 if(ppMode->error){
835 av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
836 av_free(ppMode);
837 return NULL;
838 }
839 return ppMode;
840 }
841
/**
 * Release a mode with av_free().
 *
 * @param mode mode to release
 */
void pp_free_mode(pp_mode *mode)
{
    av_free(mode);
}
845
/**
 * Replace the buffer *p by a new one of the given size obtained from
 * av_mallocz(). The old buffer is freed first and its content is not carried
 * over.
 *
 * @param p    address of the buffer pointer; receives the result of
 *             av_mallocz(), which the caller has to check
 * @param size size of the new buffer in bytes
 */
static void reallocAlign(void **p, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
850
/**
 * (Re)allocate all work buffers of a context for the given geometry and
 * record stride and qpStride in it. Previous buffer contents are discarded.
 *
 * The function has no way to report an allocation failure: buffers that
 * could not be allocated are left NULL in the context.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width, used for the histogram seed, deintTemp and
 *                 forcedQPTable
 * @param height   picture height, used for the histogram seed and for the
 *                 number of macroblock rows
 * @param stride   line size the temporary picture buffers are sized for
 * @param qpStride line size the QP tables are sized for
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    int i;

    c->stride= stride;
    c->qpStride= qpStride;

    reallocAlign((void **)&c->tempDst, stride*24+32);
    reallocAlign((void **)&c->tempSrc, stride*24);
    reallocAlign((void **)&c->tempBlocks, 2*16*8);
    reallocAlign((void **)&c->yHistogram, 256*sizeof(uint64_t));
    /* Seed every histogram bin with the same value. Skipped when the
     * allocation failed, so that a NULL pointer is not written through. */
    if(c->yHistogram){
        /* 64-bit arithmetic: width*height may not fit into an int */
        const int64_t seed= (int64_t)width*height/64*15/256;
        for(i=0; i<256; i++)
            c->yHistogram[i]= seed;
    }

    for(i=0; i<3; i++){
        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
        reallocAlign((void **)&c->tempBlurred[i], stride*mbHeight*16 + 17*1024);
        reallocAlign((void **)&c->tempBlurredPast[i], 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
    }

    reallocAlign((void **)&c->deintTemp, 2*width+32);
    reallocAlign((void **)&c->nonBQPTable, qpStride*mbHeight*sizeof(int8_t));
    reallocAlign((void **)&c->stdQPTable, qpStride*mbHeight*sizeof(int8_t));
    reallocAlign((void **)&c->forcedQPTable, mbWidth*sizeof(int8_t));
}
877
context_to_name(void * ptr)878 static const char * context_to_name(void * ptr) {
879 return "postproc";
880 }
881
882 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
883
pp_get_context(int width,int height,int cpuCaps)884 av_cold pp_context *pp_get_context(int width, int height, int cpuCaps){
885 PPContext *c= av_mallocz(sizeof(PPContext));
886 int stride= FFALIGN(width, 16); //assumed / will realloc if needed
887 int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
888
889 if (!c)
890 return NULL;
891
892 c->av_class = &av_codec_context_class;
893 if(cpuCaps&PP_FORMAT){
894 c->hChromaSubSample= cpuCaps&0x3;
895 c->vChromaSubSample= (cpuCaps>>4)&0x3;
896 }else{
897 c->hChromaSubSample= 1;
898 c->vChromaSubSample= 1;
899 }
900 if (cpuCaps & PP_CPU_CAPS_AUTO) {
901 c->cpuCaps = av_get_cpu_flags();
902 } else {
903 c->cpuCaps = 0;
904 if (cpuCaps & PP_CPU_CAPS_MMX) c->cpuCaps |= AV_CPU_FLAG_MMX;
905 if (cpuCaps & PP_CPU_CAPS_MMX2) c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
906 if (cpuCaps & PP_CPU_CAPS_3DNOW) c->cpuCaps |= AV_CPU_FLAG_3DNOW;
907 if (cpuCaps & PP_CPU_CAPS_ALTIVEC) c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
908 }
909
910 reallocBuffers(c, width, height, stride, qpStride);
911
912 c->frameNum=-1;
913
914 return c;
915 }
916
pp_free_context(void * vc)917 av_cold void pp_free_context(void *vc){
918 PPContext *c = (PPContext*)vc;
919 int i;
920
921 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurred); i++)
922 av_free(c->tempBlurred[i]);
923 for(i=0; i<FF_ARRAY_ELEMS(c->tempBlurredPast); i++)
924 av_free(c->tempBlurredPast[i]);
925
926 av_free(c->tempBlocks);
927 av_free(c->yHistogram);
928 av_free(c->tempDst);
929 av_free(c->tempSrc);
930 av_free(c->deintTemp);
931 av_free(c->stdQPTable);
932 av_free(c->nonBQPTable);
933 av_free(c->forcedQPTable);
934
935 memset(c, 0, sizeof(PPContext));
936
937 av_free(c);
938 }
939
/**
 * Postprocess one picture.
 *
 * Plane 0 is always filtered. Planes 1 and 2 are handled only if all four of
 * their source and destination pointers are non-NULL: they are filtered when
 * mode->chromMode is non-zero and copied unchanged otherwise.
 *
 * @param src       source planes
 * @param srcStride line sizes of the source planes
 * @param dst       destination planes
 * @param dstStride line sizes of the destination planes
 * @param width     width of plane 0
 * @param height    height of plane 0
 * @param QP_store  quantizer table; if NULL, or if FORCE_QUANT is set in the
 *                  luma mode, the context's forcedQPTable is used instead
 * @param QPStride  line size of the quantizer table, may be negative
 * @param vm        mode (a PPMode)
 * @param vc        context (a PPContext); its buffers are reallocated here
 *                  if the strides exceed what they were sized for
 * @param pict_type picture type; PP_PICT_TYPE_QP2 requests halving of the
 *                  quantizer values, the low 3 bits select whether
 *                  nonBQPTable is refreshed
 */
void pp_postprocess(const uint8_t * src[3], const int srcStride[3],
                    uint8_t * dst[3], const int dstStride[3],
                    int width, int height,
                    const int8_t *QP_store, int QPStride,
                    pp_mode *vm, void *vc, int pict_type)
{
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    PPMode *mode = vm;
    PPContext *c = vc;
    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int absQPStride = FFABS(QPStride);

    // c->stride and c->qpStride are always positive
    // grow the buffers if this picture needs larger line sizes than before
    if(c->stride < minStride || c->qpStride < absQPStride)
        reallocBuffers(c, width, height,
                       FFMAX(minStride, c->stride),
                       FFMAX(c->qpStride, absQPStride));

    // No usable quantizer table: fill one row of forcedQPTable with the
    // forced quantizer (or with 1) and use it with a stride of 0.
    if(!QP_store || (mode->lumMode & FORCE_QUANT)){
        int i;
        QP_store= c->forcedQPTable;
        absQPStride = QPStride = 0;
        if(mode->lumMode & FORCE_QUANT)
            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= mode->forcedQuant;
        else
            for(i=0; i<mbWidth; i++) c->forcedQPTable[i]= 1;
    }

    // Halve every quantizer into stdQPTable, four bytes per step while
    // possible, then the remaining bytes one by one.
    if(pict_type & PP_PICT_TYPE_QP2){
        int i;
        const int count= FFMAX(mbHeight * absQPStride, mbWidth);
        for(i=0; i<(count>>2); i++){
            AV_WN32(c->stdQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) >> 1 & 0x7F7F7F7F);
        }
        for(i<<=2; i<count; i++){
            c->stdQPTable[i] = QP_store[i]>>1;
        }
        QP_store= c->stdQPTable;
        QPStride= absQPStride;
    }

    // disabled debug dump of the quantizer table
    if(0){
        int x,y;
        for(y=0; y<mbHeight; y++){
            for(x=0; x<mbWidth; x++){
                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
            }
            av_log(c, AV_LOG_INFO, "\n");
        }
        av_log(c, AV_LOG_INFO, "\n");
    }

    // Refresh nonBQPTable (quantizers masked to 6 bits) unless the picture
    // type is 3.
    // NOTE(review): type 3 is presumably a B picture, going by the table's
    // name -- confirm against the callers.
    if((pict_type&7)!=3){
        if (QPStride >= 0){
            int i;
            const int count= FFMAX(mbHeight * QPStride, mbWidth);
            for(i=0; i<(count>>2); i++){
                AV_WN32(c->nonBQPTable + (i<<2), AV_RN32(QP_store + (i<<2)) & 0x3F3F3F3F);
            }
            for(i<<=2; i<count; i++){
                c->nonBQPTable[i] = QP_store[i] & 0x3F;
            }
        } else {
            // negative stride: copy row by row into a table that is laid out
            // with the positive stride
            int i,j;
            for(i=0; i<mbHeight; i++) {
                for(j=0; j<absQPStride; j++) {
                    c->nonBQPTable[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
                }
            }
        }
    }

    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
           mode->lumMode, mode->chromMode);

    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, mode, c);

    if (!(src[1] && src[2] && dst[1] && dst[2]))
        return;

    // dimensions of planes 1 and 2
    width = (width )>>c->hChromaSubSample;
    height = (height)>>c->vChromaSubSample;

    if(mode->chromMode){
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, mode, c);
    }
    // no chroma filtering requested: plain copy, done by linecpy() (defined
    // outside this file's visible part) when the line sizes match, line by
    // line otherwise
    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    }else{
        int y;
        for(y=0; y<height; y++){
            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
        }
    }
}
1042