1 /*
2  * Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 /**
24  * @file
25  * postprocessing.
26  */
27 
28 /*
29                         C       MMX     MMX2    3DNow   AltiVec
30 isVertDC                Ec      Ec                      Ec
31 isVertMinMaxOk          Ec      Ec                      Ec
32 doVertLowPass           E               e       e       Ec
33 doVertDefFilter         Ec      Ec      e       e       Ec
34 isHorizDC               Ec      Ec                      Ec
35 isHorizMinMaxOk         a       E                       Ec
36 doHorizLowPass          E               e       e       Ec
37 doHorizDefFilter        Ec      Ec      e       e       Ec
38 do_a_deblock            Ec      E       Ec      E
39 deRing                  E               e       e*      Ecp
40 Vertical RKAlgo1        E               a       a
41 Horizontal RKAlgo1                      a       a
42 Vertical X1#            a               E       E
43 Horizontal X1#          a               E       E
44 LinIpolDeinterlace      e               E       E*
45 CubicIpolDeinterlace    a               e       e*
46 LinBlendDeinterlace     e               E       E*
47 MedianDeinterlace#      E       Ec      Ec
48 TempDeNoiser#           E               e       e       Ec
49 
50 * I do not have a 3DNow! CPU -> it is untested, but no one said it does not work so it seems to work
51 # more or less selfinvented filters so the exactness is not too meaningful
52 E = Exact implementation
53 e = almost exact implementation (slightly different rounding,...)
54 a = alternative / approximate impl
55 c = checked against the other implementations (-vo md5)
56 p = partially optimized, still some work to do
57 */
58 
59 /*
60 TODO:
61 reduce the time wasted on the mem transfer
62 unroll stuff if instructions depend too much on the prior one
63 move YScale thing to the end instead of fixing QP
64 write a faster and higher quality deblocking filter :)
make the mainloop more flexible (variable number of blocks at once)
        (the if/else stuff per block is slowing things down)
67 compare the quality & speed of all filters
68 split this huge file
69 optimize c versions
70 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks
71 ...
72 */
73 
74 //Changelog: use git log
75 
76 #include "config.h"
77 
78 #include "libavutil/avutil.h"
79 #include "libavutil/avassert.h"
80 #include <inttypes.h>
81 #include <stdio.h>
82 #include <stdlib.h>
83 #include <string.h>
84 //#undef HAVE_MMXEXT_INLINE
85 //#define HAVE_AMD3DNOW_INLINE
86 //#undef HAVE_MMX_INLINE
87 //#undef ARCH_X86
88 //#define DEBUG_BRIGHTNESS
89 #include "postprocess.h"
90 #include "postprocess_internal.h"
91 #include "libavutil/avstring.h"
92 
postproc_version(void)93 unsigned postproc_version(void)
94 {
95     av_assert0(LIBPOSTPROC_VERSION_MICRO >= 100);
96     return LIBPOSTPROC_VERSION_INT;
97 }
98 
postproc_configuration(void)99 const char *postproc_configuration(void)
100 {
101     return FFMPEG_CONFIGURATION;
102 }
103 
postproc_license(void)104 const char *postproc_license(void)
105 {
106 #define LICENSE_PREFIX "libpostproc license: "
107     return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
108 }
109 
110 #if HAVE_ALTIVEC_H
111 #include <altivec.h>
112 #endif
113 
/* Size in bytes of the scratch buffer into which the mode string is copied for parsing. */
#define GET_MODE_BUFFER_SIZE 500
/* Capacity of the per-filter array of option strings collected while parsing a mode string. */
#define OPTIONS_ARRAY_SIZE 10
/* Loop bound used by the C block filters in this file (lines processed per block). */
#define BLOCK_SIZE 8
/* NOTE(review): not referenced in the visible part of this file; presumably the
 * stride of a temporary block buffer used by the included templates -- confirm. */
#define TEMP_STRIDE 8
//#define NUM_BLOCKS_AT_ONCE 16 //not used yet
119 
#if ARCH_X86 && HAVE_INLINE_ASM
/* 64-bit constants, only defined for x86 builds with inline asm.
 * wNN holds the 16-bit value 0x00NN in each of its four words,
 * bNN holds the byte value 0xNN in each of its eight bytes.
 * NOTE(review): the first DECLARE_ASM_CONST argument is presumably the
 * alignment in bytes -- confirm against the macro definition. */
DECLARE_ASM_CONST(8, uint64_t, w05)= 0x0005000500050005LL;
DECLARE_ASM_CONST(8, uint64_t, w04)= 0x0004000400040004LL;
DECLARE_ASM_CONST(8, uint64_t, w20)= 0x0020002000200020LL;
DECLARE_ASM_CONST(8, uint64_t, b00)= 0x0000000000000000LL;
DECLARE_ASM_CONST(8, uint64_t, b01)= 0x0101010101010101LL;
DECLARE_ASM_CONST(8, uint64_t, b02)= 0x0202020202020202LL;
DECLARE_ASM_CONST(8, uint64_t, b08)= 0x0808080808080808LL;
DECLARE_ASM_CONST(8, uint64_t, b80)= 0x8080808080808080LL;
#endif
130 
/* NOTE(review): not used in the visible part of this file; presumably the
 * threshold of the deringing filter in the included templates -- confirm. */
DECLARE_ASM_CONST(8, int, deringThreshold)= 20;
132 
133 
/**
 * Table of the filters that can be named in a mode string.
 *
 * The mode parser compares each filter name against shortName and longName
 * (case-sensitive strcmp) and, on a match, applies the entry's mask to the
 * luma/chroma mode flags subject to minLumQuality, chromDefault and
 * minChromQuality.
 * NOTE(review): the three integers of each initializer presumably map to
 * chromDefault, minLumQuality, minChromQuality in that order -- confirm
 * against the declaration of struct PPFilter.
 *
 * The table is terminated by an entry whose shortName is NULL.
 */
static const struct PPFilter filters[]=
{
    {"hb", "hdeblock",              1, 1, 3, H_DEBLOCK},
    {"vb", "vdeblock",              1, 2, 4, V_DEBLOCK},
/*  {"hr", "rkhdeblock",            1, 1, 3, H_RK1_FILTER},
    {"vr", "rkvdeblock",            1, 2, 4, V_RK1_FILTER},*/
    {"h1", "x1hdeblock",            1, 1, 3, H_X1_FILTER},
    {"v1", "x1vdeblock",            1, 2, 4, V_X1_FILTER},
    {"ha", "ahdeblock",             1, 1, 3, H_A_DEBLOCK},
    {"va", "avdeblock",             1, 2, 4, V_A_DEBLOCK},
    {"dr", "dering",                1, 5, 6, DERING},
    {"al", "autolevels",            0, 1, 2, LEVEL_FIX},
    {"lb", "linblenddeint",         1, 1, 4, LINEAR_BLEND_DEINT_FILTER},
    {"li", "linipoldeint",          1, 1, 4, LINEAR_IPOL_DEINT_FILTER},
    {"ci", "cubicipoldeint",        1, 1, 4, CUBIC_IPOL_DEINT_FILTER},
    {"md", "mediandeint",           1, 1, 4, MEDIAN_DEINT_FILTER},
    {"fd", "ffmpegdeint",           1, 1, 4, FFMPEG_DEINT_FILTER},
    {"l5", "lowpass5",              1, 1, 4, LOWPASS5_DEINT_FILTER},
    {"tn", "tmpnoise",              1, 7, 8, TEMP_NOISE_FILTER},
    {"fq", "forcequant",            1, 0, 0, FORCE_QUANT},
    {"be", "bitexact",              1, 0, 0, BITEXACT},
    {NULL, NULL,0,0,0,0} //End Marker
};
157 
/**
 * Alias table for the mode parser, stored as consecutive
 * (alias name, replacement filter list) string pairs.
 *
 * When a filter name in the mode string equals an alias name, the parser
 * splices the corresponding replacement list into the mode string.
 * The table is terminated by a single NULL in an alias-name slot.
 */
static const char * const replaceTable[]=
{
    "default",      "hb:a,vb:a,dr:a",
    "de",           "hb:a,vb:a,dr:a",
    "fast",         "h1:a,v1:a,dr:a",
    "fa",           "h1:a,v1:a,dr:a",
    "ac",           "ha:a:128:7,va:a,dr:a",
    NULL //End Marker
};
167 
168 
169 #if ARCH_X86 && HAVE_INLINE_ASM
/* Emit the x86 "prefetchnta" instruction for the memory address p. */
static inline void prefetchnta(void *p)
{
    __asm__ volatile(   "prefetchnta (%0)\n\t"
        : : "r" (p)
    );
}
176 
/* Emit the x86 "prefetcht0" instruction for the memory address p. */
static inline void prefetcht0(void *p)
{
    __asm__ volatile(   "prefetcht0 (%0)\n\t"
        : : "r" (p)
    );
}
183 
/* Emit the x86 "prefetcht1" instruction for the memory address p. */
static inline void prefetcht1(void *p)
{
    __asm__ volatile(   "prefetcht1 (%0)\n\t"
        : : "r" (p)
    );
}
190 
/* Emit the x86 "prefetcht2" instruction for the memory address p. */
static inline void prefetcht2(void *p)
{
    __asm__ volatile(   "prefetcht2 (%0)\n\t"
        : : "r" (p)
    );
}
197 #endif
198 
199 /* The horizontal functions exist only in C because the MMX
200  * code is faster with vertical filters and transposing. */
201 
202 /**
203  * Check if the given 8x8 Block is mostly "flat"
204  */
isHorizDC_C(const uint8_t src[],int stride,const PPContext * c)205 static inline int isHorizDC_C(const uint8_t src[], int stride, const PPContext *c)
206 {
207     int numEq= 0;
208     int y;
209     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
210     const int dcThreshold= dcOffset*2 + 1;
211 
212     for(y=0; y<BLOCK_SIZE; y++){
213         if(((unsigned)(src[0] - src[1] + dcOffset)) < dcThreshold) numEq++;
214         if(((unsigned)(src[1] - src[2] + dcOffset)) < dcThreshold) numEq++;
215         if(((unsigned)(src[2] - src[3] + dcOffset)) < dcThreshold) numEq++;
216         if(((unsigned)(src[3] - src[4] + dcOffset)) < dcThreshold) numEq++;
217         if(((unsigned)(src[4] - src[5] + dcOffset)) < dcThreshold) numEq++;
218         if(((unsigned)(src[5] - src[6] + dcOffset)) < dcThreshold) numEq++;
219         if(((unsigned)(src[6] - src[7] + dcOffset)) < dcThreshold) numEq++;
220         src+= stride;
221     }
222     return numEq > c->ppMode.flatnessThreshold;
223 }
224 
225 /**
226  * Check if the middle 8x8 Block in the given 8x16 block is flat
227  */
isVertDC_C(const uint8_t src[],int stride,const PPContext * c)228 static inline int isVertDC_C(const uint8_t src[], int stride, const PPContext *c)
229 {
230     int numEq= 0;
231     int y;
232     const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
233     const int dcThreshold= dcOffset*2 + 1;
234 
235     src+= stride*4; // src points to begin of the 8x8 Block
236     for(y=0; y<BLOCK_SIZE-1; y++){
237         if(((unsigned)(src[0] - src[0+stride] + dcOffset)) < dcThreshold) numEq++;
238         if(((unsigned)(src[1] - src[1+stride] + dcOffset)) < dcThreshold) numEq++;
239         if(((unsigned)(src[2] - src[2+stride] + dcOffset)) < dcThreshold) numEq++;
240         if(((unsigned)(src[3] - src[3+stride] + dcOffset)) < dcThreshold) numEq++;
241         if(((unsigned)(src[4] - src[4+stride] + dcOffset)) < dcThreshold) numEq++;
242         if(((unsigned)(src[5] - src[5+stride] + dcOffset)) < dcThreshold) numEq++;
243         if(((unsigned)(src[6] - src[6+stride] + dcOffset)) < dcThreshold) numEq++;
244         if(((unsigned)(src[7] - src[7+stride] + dcOffset)) < dcThreshold) numEq++;
245         src+= stride;
246     }
247     return numEq > c->ppMode.flatnessThreshold;
248 }
249 
/**
 * Sample-based range check for horizontal filtering of an 8x8 block.
 *
 * On each of the eight rows one pair of pixels, five columns apart, is
 * compared; the column pair cycles through (0,5), (2,7), (4,1), (6,3).
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 * @param QP     quantizer; a pair passes when its difference is within +-2*QP
 * @return 1 if every sampled pair passes, 0 as soon as one does not
 */
static inline int isHorizMinMaxOk_C(const uint8_t src[], int stride, int QP)
{
    /* {minuend column, subtrahend column} for rows 0..3; rows 4..7 repeat it */
    static const uint8_t colPair[4][2] = { {0, 5}, {2, 7}, {4, 1}, {6, 3} };
    const int bias  = 2*QP;
    const int limit = 4*QP;
    int line;

    for (line = 0; line < 8; line++) {
        const uint8_t *row = src + line*stride;
        const int diff = row[colPair[line & 3][0]] - row[colPair[line & 3][1]];

        /* unsigned compare rejects both diff > 2*QP and diff < -2*QP */
        if ((unsigned)(diff + bias) > limit)
            return 0;
    }
    return 1;
}
265 
isVertMinMaxOk_C(const uint8_t src[],int stride,int QP)266 static inline int isVertMinMaxOk_C(const uint8_t src[], int stride, int QP)
267 {
268     int x;
269     src+= stride*4;
270     for(x=0; x<BLOCK_SIZE; x+=4){
271         if((unsigned)(src[  x + 0*stride] - src[  x + 5*stride] + 2*QP) > 4*QP) return 0;
272         if((unsigned)(src[1+x + 2*stride] - src[1+x + 7*stride] + 2*QP) > 4*QP) return 0;
273         if((unsigned)(src[2+x + 4*stride] - src[2+x + 1*stride] + 2*QP) > 4*QP) return 0;
274         if((unsigned)(src[3+x + 6*stride] - src[3+x + 3*stride] + 2*QP) > 4*QP) return 0;
275     }
276     return 1;
277 }
278 
horizClassify_C(const uint8_t src[],int stride,const PPContext * c)279 static inline int horizClassify_C(const uint8_t src[], int stride, const PPContext *c)
280 {
281     if( isHorizDC_C(src, stride, c) ){
282         if( isHorizMinMaxOk_C(src, stride, c->QP) )
283             return 1;
284         else
285             return 0;
286     }else{
287         return 2;
288     }
289 }
290 
vertClassify_C(const uint8_t src[],int stride,const PPContext * c)291 static inline int vertClassify_C(const uint8_t src[], int stride, const PPContext *c)
292 {
293     if( isVertDC_C(src, stride, c) ){
294         if( isVertMinMaxOk_C(src, stride, c->QP) )
295             return 1;
296         else
297             return 0;
298     }else{
299         return 2;
300     }
301 }
302 
doHorizDefFilter_C(uint8_t dst[],int stride,const PPContext * c)303 static inline void doHorizDefFilter_C(uint8_t dst[], int stride, const PPContext *c)
304 {
305     int y;
306     for(y=0; y<BLOCK_SIZE; y++){
307         const int middleEnergy= 5*(dst[4] - dst[3]) + 2*(dst[2] - dst[5]);
308 
309         if(FFABS(middleEnergy) < 8*c->QP){
310             const int q=(dst[3] - dst[4])/2;
311             const int leftEnergy=  5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
312             const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
313 
314             int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
315             d= FFMAX(d, 0);
316 
317             d= (5*d + 32) >> 6;
318             d*= FFSIGN(-middleEnergy);
319 
320             if(q>0)
321             {
322                 d = FFMAX(d, 0);
323                 d = FFMIN(d, q);
324             }
325             else
326             {
327                 d = FFMIN(d, 0);
328                 d = FFMAX(d, q);
329             }
330 
331             dst[3]-= d;
332             dst[4]+= d;
333         }
334         dst+= stride;
335     }
336 }
337 
338 /**
339  * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
340  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
341  */
doHorizLowPass_C(uint8_t dst[],int stride,const PPContext * c)342 static inline void doHorizLowPass_C(uint8_t dst[], int stride, const PPContext *c)
343 {
344     int y;
345     for(y=0; y<BLOCK_SIZE; y++){
346         const int first= FFABS(dst[-1] - dst[0]) < c->QP ? dst[-1] : dst[0];
347         const int last= FFABS(dst[8] - dst[7]) < c->QP ? dst[8] : dst[7];
348 
349         int sums[10];
350         sums[0] = 4*first + dst[0] + dst[1] + dst[2] + 4;
351         sums[1] = sums[0] - first  + dst[3];
352         sums[2] = sums[1] - first  + dst[4];
353         sums[3] = sums[2] - first  + dst[5];
354         sums[4] = sums[3] - first  + dst[6];
355         sums[5] = sums[4] - dst[0] + dst[7];
356         sums[6] = sums[5] - dst[1] + last;
357         sums[7] = sums[6] - dst[2] + last;
358         sums[8] = sums[7] - dst[3] + last;
359         sums[9] = sums[8] - dst[4] + last;
360 
361         dst[0]= (sums[0] + sums[2] + 2*dst[0])>>4;
362         dst[1]= (sums[1] + sums[3] + 2*dst[1])>>4;
363         dst[2]= (sums[2] + sums[4] + 2*dst[2])>>4;
364         dst[3]= (sums[3] + sums[5] + 2*dst[3])>>4;
365         dst[4]= (sums[4] + sums[6] + 2*dst[4])>>4;
366         dst[5]= (sums[5] + sums[7] + 2*dst[5])>>4;
367         dst[6]= (sums[6] + sums[8] + 2*dst[6])>>4;
368         dst[7]= (sums[7] + sums[9] + 2*dst[7])>>4;
369 
370         dst+= stride;
371     }
372 }
373 
374 /**
375  * Experimental Filter 1 (Horizontal)
376  * will not damage linear gradients
377  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
378  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
379  * MMX2 version does correct clipping C version does not
380  * not identical with the vertical one
381  */
horizX1Filter(uint8_t * src,int stride,int QP)382 static inline void horizX1Filter(uint8_t *src, int stride, int QP)
383 {
384     int y;
385     static uint64_t lut[256];
386     if(!lut[255])
387     {
388         int i;
389         for(i=0; i<256; i++)
390         {
391             int v= i < 128 ? 2*i : 2*(i-256);
392 /*
393 //Simulate 112242211 9-Tap filter
394             uint64_t a= (v/16)  & 0xFF;
395             uint64_t b= (v/8)   & 0xFF;
396             uint64_t c= (v/4)   & 0xFF;
397             uint64_t d= (3*v/8) & 0xFF;
398 */
399 //Simulate piecewise linear interpolation
400             uint64_t a= (v/16)   & 0xFF;
401             uint64_t b= (v*3/16) & 0xFF;
402             uint64_t c= (v*5/16) & 0xFF;
403             uint64_t d= (7*v/16) & 0xFF;
404             uint64_t A= (0x100 - a)&0xFF;
405             uint64_t B= (0x100 - b)&0xFF;
406             uint64_t C= (0x100 - c)&0xFF;
407             uint64_t D= (0x100 - c)&0xFF;
408 
409             lut[i]   = (a<<56) | (b<<48) | (c<<40) | (d<<32) |
410                        (D<<24) | (C<<16) | (B<<8)  | (A);
411             //lut[i] = (v<<32) | (v<<24);
412         }
413     }
414 
415     for(y=0; y<BLOCK_SIZE; y++){
416         int a= src[1] - src[2];
417         int b= src[3] - src[4];
418         int c= src[5] - src[6];
419 
420         int d= FFMAX(FFABS(b) - (FFABS(a) + FFABS(c))/2, 0);
421 
422         if(d < QP){
423             int v = d * FFSIGN(-b);
424 
425             src[1] +=v/8;
426             src[2] +=v/4;
427             src[3] +=3*v/8;
428             src[4] -=3*v/8;
429             src[5] -=v/4;
430             src[6] -=v/8;
431         }
432         src+=stride;
433     }
434 }
435 
/**
 * accurate deblock filter
 *
 * Processes 8 lines. Within a line, consecutive samples are `step` bytes
 * apart; consecutive lines are `stride` bytes apart, so the same code can
 * filter either direction depending on the step/stride the caller passes.
 * Per line, the samples at indices -1..8 (relative to src + 4*step) are
 * classified:
 *  - if more than c->ppMode.flatnessThreshold of the 9 adjacent pairs are
 *    within +-dcOffset and the sampled max-min range is below 2*QP, samples
 *    0..7 are replaced by a 9-tap low pass (same arithmetic as
 *    doHorizLowPass_C);
 *  - if the line is not flat, samples 3 and 4 are corrected with the same
 *    arithmetic as doHorizDefFilter_C.
 *
 * @param src    start of the area; filtering is applied 4*step bytes further on
 * @param step   distance in bytes between consecutive samples of one line
 * @param stride distance in bytes between consecutive lines
 * @param c      context supplying QP, nonBQP and the ppMode thresholds
 */
static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
                                            int stride, const PPContext *c)
{
    int y;
    const int QP= c->QP;
    // a pair of neighbours counts as "equal" if it differs by at most dcOffset
    const int dcOffset= ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
    const int dcThreshold= dcOffset*2 + 1;
//START_TIMER
    src+= step*4; // src points to begin of the 8x8 Block
    for(y=0; y<8; y++){
        int numEq= 0;

        // count the adjacent pairs (indices -1..8) that are nearly equal
        if(((unsigned)(src[-1*step] - src[0*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 0*step] - src[1*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 1*step] - src[2*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 2*step] - src[3*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 3*step] - src[4*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 4*step] - src[5*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 5*step] - src[6*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 6*step] - src[7*step] + dcOffset)) < dcThreshold) numEq++;
        if(((unsigned)(src[ 7*step] - src[8*step] + dcOffset)) < dcThreshold) numEq++;
        if(numEq > c->ppMode.flatnessThreshold){
            // flat line: find min/max of samples 0..7, handling them in pairs
            int min, max, x;

            if(src[0] > src[step]){
                max= src[0];
                min= src[step];
            }else{
                max= src[step];
                min= src[0];
            }
            for(x=2; x<8; x+=2){
                if(src[x*step] > src[(x+1)*step]){
                        if(src[x    *step] > max) max= src[ x   *step];
                        if(src[(x+1)*step] < min) min= src[(x+1)*step];
                }else{
                        if(src[(x+1)*step] > max) max= src[(x+1)*step];
                        if(src[ x   *step] < min) min= src[ x   *step];
                }
            }
            // low pass only if the whole line stays within a 2*QP range
            if(max-min < 2*QP){
                // edge samples: take the outer neighbour only if it is within QP of the edge pixel
                const int first= FFABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0];
                const int last= FFABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step];

                // sums[k]: 7-sample sliding sum over the edge-extended line, plus 4 for rounding
                int sums[10];
                sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4;
                sums[1] = sums[0] - first       + src[3*step];
                sums[2] = sums[1] - first       + src[4*step];
                sums[3] = sums[2] - first       + src[5*step];
                sums[4] = sums[3] - first       + src[6*step];
                sums[5] = sums[4] - src[0*step] + src[7*step];
                sums[6] = sums[5] - src[1*step] + last;
                sums[7] = sums[6] - src[2*step] + last;
                sums[8] = sums[7] - src[3*step] + last;
                sums[9] = sums[8] - src[4*step] + last;

                // two overlapping sums plus twice the centre give the (1,1,2,2,4,2,2,1,1)/16 taps
                src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4;
                src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4;
                src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4;
                src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4;
                src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4;
                src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4;
                src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4;
                src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4;
            }
        }else{
            // non-flat line: default filter, touches only samples 3 and 4
            const int middleEnergy= 5*(src[4*step] - src[3*step]) + 2*(src[2*step] - src[5*step]);

            if(FFABS(middleEnergy) < 8*QP){
                const int q=(src[3*step] - src[4*step])/2;
                const int leftEnergy=  5*(src[2*step] - src[1*step]) + 2*(src[0*step] - src[3*step]);
                const int rightEnergy= 5*(src[6*step] - src[5*step]) + 2*(src[4*step] - src[7*step]);

                int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
                d= FFMAX(d, 0);

                d= (5*d + 32) >> 6;
                d*= FFSIGN(-middleEnergy);

                // clamp the correction between 0 and half the step across the edge
                if(q>0){
                    d = FFMAX(d, 0);
                    d = FFMIN(d, q);
                }else{
                    d = FFMIN(d, 0);
                    d = FFMAX(d, q);
                }

                src[3*step]-= d;
                src[4*step]+= d;
            }
        }

        src += stride;
    }
/*if(step==16){
    STOP_TIMER("step16")
}else{
    STOP_TIMER("stepX")
}*/
}
539 
//Note: there are C, MMX, MMX2 and 3DNow versions; there is no combined 3DNow+MMX2 one
//Plain C versions
//the C version is always compiled because testing needs bitexactness
543 #define TEMPLATE_PP_C 1
544 #include "postprocess_template.c"
545 
546 #if HAVE_ALTIVEC
547 #   define TEMPLATE_PP_ALTIVEC 1
548 #   include "postprocess_altivec_template.c"
549 #   include "postprocess_template.c"
550 #endif
551 
552 #if ARCH_X86 && HAVE_INLINE_ASM
553 #    if CONFIG_RUNTIME_CPUDETECT
554 #        define TEMPLATE_PP_MMX 1
555 #        include "postprocess_template.c"
556 #        define TEMPLATE_PP_MMXEXT 1
557 #        include "postprocess_template.c"
558 #        define TEMPLATE_PP_3DNOW 1
559 #        include "postprocess_template.c"
560 #        define TEMPLATE_PP_SSE2 1
561 #        include "postprocess_template.c"
562 #    else
563 #        if HAVE_SSE2_INLINE
564 #            define TEMPLATE_PP_SSE2 1
565 #            include "postprocess_template.c"
566 #        elif HAVE_MMXEXT_INLINE
567 #            define TEMPLATE_PP_MMXEXT 1
568 #            include "postprocess_template.c"
569 #        elif HAVE_AMD3DNOW_INLINE
570 #            define TEMPLATE_PP_3DNOW 1
571 #            include "postprocess_template.c"
572 #        elif HAVE_MMX_INLINE
573 #            define TEMPLATE_PP_MMX 1
574 #            include "postprocess_template.c"
575 #        endif
576 #    endif
577 #endif
578 
/* Common signature of the postProcess_* implementations (C and the
 * CPU-specific template instantiations) among which postProcess() selects. */
typedef void (*pp_fn)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
                      const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2);
581 
/**
 * Run the postprocessing of one plane with the best available implementation.
 *
 * Copies the mode *vm into the context, selects a postProcess_* function and
 * forwards all plane arguments to it. postProcess_C is the default and is
 * always used when the BITEXACT flag is set in the mode's lumMode. Otherwise
 * an optimized version is chosen: at run time from c->cpuCaps when
 * CONFIG_RUNTIME_CPUDETECT is enabled, else at compile time from the
 * HAVE_* configuration macros.
 *
 * @param vm mode, cast to PPMode
 * @param vc context, cast to PPContext; its ppMode member is overwritten
 */
static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
        const QP_STORE_T QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
{
    pp_fn pp = postProcess_C;
    PPContext *c= (PPContext *)vc;
    PPMode *ppMode= (PPMode *)vm;
    c->ppMode= *ppMode; //FIXME

    // optimized code paths are only allowed when bitexact output is not requested
    if (!(ppMode->lumMode & BITEXACT)) {
#if CONFIG_RUNTIME_CPUDETECT
#if ARCH_X86 && HAVE_INLINE_ASM
        // ordered per speed fastest first
        if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
        else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
        else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
        else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
#elif HAVE_ALTIVEC
        if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
#endif
#else /* CONFIG_RUNTIME_CPUDETECT */
        // without runtime detection exactly one optimized variant was compiled in
#if     HAVE_SSE2_INLINE
        pp = postProcess_SSE2;
#elif   HAVE_MMXEXT_INLINE
        pp = postProcess_MMX2;
#elif HAVE_AMD3DNOW_INLINE
        pp = postProcess_3DNow;
#elif HAVE_MMX_INLINE
        pp = postProcess_MMX;
#elif HAVE_ALTIVEC
        pp = postProcess_altivec;
#endif
#endif /* !CONFIG_RUNTIME_CPUDETECT */
    }

    pp(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, c);
}
618 
/**
 * -pp command line help.
 *
 * Describes the filter names and options understood by the mode string
 * parser; it is logged line by line when the mode name "help" is requested.
 * The long names listed here must match the entries of the filters table,
 * because the parser compares names with a case-sensitive strcmp():
 * the accurate deblockers are "ahdeblock"/"avdeblock" and the quantizer
 * override is "forcequant".
 */
const char pp_help[] =
"Available postprocessing filters:\n"
"Filters                        Options\n"
"short  long name       short   long option     Description\n"
"*      *               a       autoq           CPU power dependent enabler\n"
"                       c       chrom           chrominance filtering enabled\n"
"                       y       nochrom         chrominance filtering disabled\n"
"                       n       noluma          luma filtering disabled\n"
"hb     hdeblock        (2 threshold)           horizontal deblocking filter\n"
"       1. difference factor: default=32, higher -> more deblocking\n"
"       2. flatness threshold: default=39, lower -> more deblocking\n"
"                       the h & v deblocking filters share these\n"
"                       so you can't set different thresholds for h / v\n"
"vb     vdeblock        (2 threshold)           vertical deblocking filter\n"
"ha     ahdeblock       (2 threshold)           horizontal deblocking filter\n"
"va     avdeblock       (2 threshold)           vertical deblocking filter\n"
"h1     x1hdeblock                              experimental h deblock filter 1\n"
"v1     x1vdeblock                              experimental v deblock filter 1\n"
"dr     dering                                  deringing filter\n"
"al     autolevels                              automatic brightness / contrast\n"
"                       f        fullyrange     stretch luminance to (0..255)\n"
"lb     linblenddeint                           linear blend deinterlacer\n"
"li     linipoldeint                            linear interpolating deinterlace\n"
"ci     cubicipoldeint                          cubic interpolating deinterlacer\n"
"md     mediandeint                             median deinterlacer\n"
"fd     ffmpegdeint                             ffmpeg deinterlacer\n"
"l5     lowpass5                                FIR lowpass deinterlacer\n"
"de     default                                 hb:a,vb:a,dr:a\n"
"fa     fast                                    h1:a,v1:a,dr:a\n"
"ac                                             ha:a:128:7,va:a,dr:a\n"
"tn     tmpnoise        (3 threshold)           temporal noise reducer\n"
"                     1. <= 2. <= 3.            larger -> stronger filtering\n"
"fq     forcequant      <quantizer>             force quantizer\n"
"be     bitexact                                use only the bitexact C implementation\n"
"Usage:\n"
"<filterName>[:<option>[:<option>...]][[,|/][-]<filterName>[:<option>...]]...\n"
"long form example:\n"
"vdeblock:autoq/hdeblock:autoq/linblenddeint    default,-vdeblock\n"
"short form example:\n"
"vb:a/hb:a/lb                                   de,-vb\n"
"more examples:\n"
"tn:64:128:256\n"
"\n"
;
664 
pp_get_mode_by_name_and_quality(const char * name,int quality)665 pp_mode *pp_get_mode_by_name_and_quality(const char *name, int quality)
666 {
667     char temp[GET_MODE_BUFFER_SIZE];
668     char *p= temp;
669     static const char filterDelimiters[] = ",/";
670     static const char optionDelimiters[] = ":|";
671     struct PPMode *ppMode;
672     char *filterToken;
673 
674     if (!name)  {
675         av_log(NULL, AV_LOG_ERROR, "pp: Missing argument\n");
676         return NULL;
677     }
678 
679     if (!strcmp(name, "help")) {
680         const char *p;
681         for (p = pp_help; strchr(p, '\n'); p = strchr(p, '\n') + 1) {
682             av_strlcpy(temp, p, FFMIN(sizeof(temp), strchr(p, '\n') - p + 2));
683             av_log(NULL, AV_LOG_INFO, "%s", temp);
684         }
685         return NULL;
686     }
687 
688     ppMode= av_malloc(sizeof(PPMode));
689 
690     ppMode->lumMode= 0;
691     ppMode->chromMode= 0;
692     ppMode->maxTmpNoise[0]= 700;
693     ppMode->maxTmpNoise[1]= 1500;
694     ppMode->maxTmpNoise[2]= 3000;
695     ppMode->maxAllowedY= 234;
696     ppMode->minAllowedY= 16;
697     ppMode->baseDcDiff= 256/8;
698     ppMode->flatnessThreshold= 56-16-1;
699     ppMode->maxClippedThreshold= 0.01;
700     ppMode->error=0;
701 
702     memset(temp, 0, GET_MODE_BUFFER_SIZE);
703     av_strlcpy(temp, name, GET_MODE_BUFFER_SIZE - 1);
704 
705     av_log(NULL, AV_LOG_DEBUG, "pp: %s\n", name);
706 
707     for(;;){
708         char *filterName;
709         int q= 1000000; //PP_QUALITY_MAX;
710         int chrom=-1;
711         int luma=-1;
712         char *option;
713         char *options[OPTIONS_ARRAY_SIZE];
714         int i;
715         int filterNameOk=0;
716         int numOfUnknownOptions=0;
717         int enable=1; //does the user want us to enabled or disabled the filter
718 
719         filterToken= strtok(p, filterDelimiters);
720         if(!filterToken) break;
721         p+= strlen(filterToken) + 1; // p points to next filterToken
722         filterName= strtok(filterToken, optionDelimiters);
723         if (!filterName) {
724             ppMode->error++;
725             break;
726         }
727         av_log(NULL, AV_LOG_DEBUG, "pp: %s::%s\n", filterToken, filterName);
728 
729         if(*filterName == '-'){
730             enable=0;
731             filterName++;
732         }
733 
734         for(;;){ //for all options
735             option= strtok(NULL, optionDelimiters);
736             if(!option) break;
737 
738             av_log(NULL, AV_LOG_DEBUG, "pp: option: %s\n", option);
739             if(!strcmp("autoq", option) || !strcmp("a", option)) q= quality;
740             else if(!strcmp("nochrom", option) || !strcmp("y", option)) chrom=0;
741             else if(!strcmp("chrom", option) || !strcmp("c", option)) chrom=1;
742             else if(!strcmp("noluma", option) || !strcmp("n", option)) luma=0;
743             else{
744                 options[numOfUnknownOptions] = option;
745                 numOfUnknownOptions++;
746             }
747             if(numOfUnknownOptions >= OPTIONS_ARRAY_SIZE-1) break;
748         }
749         options[numOfUnknownOptions] = NULL;
750 
751         /* replace stuff from the replace Table */
752         for(i=0; replaceTable[2*i]; i++){
753             if(!strcmp(replaceTable[2*i], filterName)){
754                 int newlen= strlen(replaceTable[2*i + 1]);
755                 int plen;
756                 int spaceLeft;
757 
758                 p--, *p=',';
759 
760                 plen= strlen(p);
761                 spaceLeft= p - temp + plen;
762                 if(spaceLeft + newlen  >= GET_MODE_BUFFER_SIZE - 1){
763                     ppMode->error++;
764                     break;
765                 }
766                 memmove(p + newlen, p, plen+1);
767                 memcpy(p, replaceTable[2*i + 1], newlen);
768                 filterNameOk=1;
769             }
770         }
771 
772         for(i=0; filters[i].shortName; i++){
773             if(   !strcmp(filters[i].longName, filterName)
774                || !strcmp(filters[i].shortName, filterName)){
775                 ppMode->lumMode &= ~filters[i].mask;
776                 ppMode->chromMode &= ~filters[i].mask;
777 
778                 filterNameOk=1;
779                 if(!enable) break; // user wants to disable it
780 
781                 if(q >= filters[i].minLumQuality && luma)
782                     ppMode->lumMode|= filters[i].mask;
783                 if(chrom==1 || (chrom==-1 && filters[i].chromDefault))
784                     if(q >= filters[i].minChromQuality)
785                             ppMode->chromMode|= filters[i].mask;
786 
787                 if(filters[i].mask == LEVEL_FIX){
788                     int o;
789                     ppMode->minAllowedY= 16;
790                     ppMode->maxAllowedY= 234;
791                     for(o=0; options[o]; o++){
792                         if(  !strcmp(options[o],"fullyrange")
793                            ||!strcmp(options[o],"f")){
794                             ppMode->minAllowedY= 0;
795                             ppMode->maxAllowedY= 255;
796                             numOfUnknownOptions--;
797                         }
798                     }
799                 }
800                 else if(filters[i].mask == TEMP_NOISE_FILTER)
801                 {
802                     int o;
803                     int numOfNoises=0;
804 
805                     for(o=0; options[o]; o++){
806                         char *tail;
807                         ppMode->maxTmpNoise[numOfNoises]=
808                             strtol(options[o], &tail, 0);
809                         if(tail!=options[o]){
810                             numOfNoises++;
811                             numOfUnknownOptions--;
812                             if(numOfNoises >= 3) break;
813                         }
814                     }
815                 }
816                 else if(filters[i].mask == V_DEBLOCK   || filters[i].mask == H_DEBLOCK
817                      || filters[i].mask == V_A_DEBLOCK || filters[i].mask == H_A_DEBLOCK){
818                     int o;
819 
820                     for(o=0; options[o] && o<2; o++){
821                         char *tail;
822                         int val= strtol(options[o], &tail, 0);
823                         if(tail==options[o]) break;
824 
825                         numOfUnknownOptions--;
826                         if(o==0) ppMode->baseDcDiff= val;
827                         else ppMode->flatnessThreshold= val;
828                     }
829                 }
830                 else if(filters[i].mask == FORCE_QUANT){
831                     int o;
832                     ppMode->forcedQuant= 15;
833 
834                     for(o=0; options[o] && o<1; o++){
835                         char *tail;
836                         int val= strtol(options[o], &tail, 0);
837                         if(tail==options[o]) break;
838 
839                         numOfUnknownOptions--;
840                         ppMode->forcedQuant= val;
841                     }
842                 }
843             }
844         }
845         if(!filterNameOk) ppMode->error++;
846         ppMode->error += numOfUnknownOptions;
847     }
848 
849     av_log(NULL, AV_LOG_DEBUG, "pp: lumMode=%X, chromMode=%X\n", ppMode->lumMode, ppMode->chromMode);
850     if(ppMode->error){
851         av_log(NULL, AV_LOG_ERROR, "%d errors in postprocess string \"%s\"\n", ppMode->error, name);
852         av_free(ppMode);
853         return NULL;
854     }
855     return ppMode;
856 }
857 
/**
 * Release a postprocessing mode object.
 *
 * @param mode mode to dispose of; it is handed unchanged to av_free()
 */
void pp_free_mode(pp_mode *mode)
{
    av_free(mode);
}
861 
/**
 * Replace the buffer referenced through p with a new allocation.
 *
 * The old buffer is released first, then *p is overwritten with whatever
 * av_mallocz() returns for the requested size. Nothing is copied from the
 * old buffer to the new one.
 *
 * @param p         address of the buffer pointer to replace
 * @param alignment not used by this implementation
 * @param size      number of bytes to request
 */
static void reallocAlign(void **p, int alignment, int size)
{
    void *fresh;

    av_free(*p);
    fresh = av_mallocz(size);
    *p    = fresh;
}
866 
/**
 * (Re)allocate all scratch buffers of a context for the given geometry.
 *
 * Every buffer is replaced through reallocAlign(), so previous contents are
 * not carried over. The stride and QP stride are stored in the context.
 * Allocation failures are not reported to the caller; a buffer whose
 * allocation failed is left as NULL.
 *
 * @param c        context whose buffers are replaced
 * @param width    picture width; determines the macroblock count per row and
 *                 the size of the deinterlace scratch line
 * @param height   picture height; determines the number of macroblock rows
 * @param stride   line size used to dimension the temporary picture buffers
 * @param qpStride number of entries per row of the QP tables
 */
static void reallocBuffers(PPContext *c, int width, int height, int stride, int qpStride){
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    int i;

    c->stride= stride;
    c->qpStride= qpStride;

    reallocAlign((void **)&c->tempDst, 8, stride*24+32);
    reallocAlign((void **)&c->tempSrc, 8, stride*24);
    reallocAlign((void **)&c->tempBlocks, 8, 2*16*8);
    reallocAlign((void **)&c->yHistogram, 8, 256*sizeof(uint64_t));
    /* reallocAlign() stores the allocator result unchecked, so the histogram
     * pointer can be NULL here; do not write through it in that case. */
    if(c->yHistogram.t_uint64_t){
        for(i=0; i<256; i++)
            c->yHistogram.t_uint64_t[i]= width*height/64*15/256;
    }

    for(i=0; i<3; i++){
        //Note: The +17*1024 is just there so I do not have to worry about r/w over the end.
        reallocAlign((void **)&c->tempBlurred[i], 8, stride*mbHeight*16 + 17*1024);
        reallocAlign((void **)&c->tempBlurredPast[i], 8, 256*((height+7)&(~7))/2 + 17*1024);//FIXME size
    }

    reallocAlign((void **)&c->deintTemp, 8, 2*width+32);
    reallocAlign((void **)&c->nonBQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
    reallocAlign((void **)&c->stdQPTable, 8, qpStride*mbHeight*sizeof(QP_STORE_T));
    /* the forced table holds a single row: one entry per macroblock column */
    reallocAlign((void **)&c->forcedQPTable, 8, mbWidth*sizeof(QP_STORE_T));
}
893 
/**
 * Naming callback referenced by av_codec_context_class.
 *
 * @param ptr ignored; the same name is reported for every object
 * @return the constant string "postproc"
 */
static const char *context_to_name(void *ptr)
{
    (void)ptr;
    return "postproc";
}
897 
898 static const AVClass av_codec_context_class = { "Postproc", context_to_name, NULL };
899 
pp_get_context(int width,int height,int cpuCaps)900 pp_context *pp_get_context(int width, int height, int cpuCaps){
901     PPContext *c= av_malloc(sizeof(PPContext));
902     int stride= FFALIGN(width, 16);  //assumed / will realloc if needed
903     int qpStride= (width+15)/16 + 2; //assumed / will realloc if needed
904 
905     memset(c, 0, sizeof(PPContext));
906     c->av_class = &av_codec_context_class;
907     if(cpuCaps&PP_FORMAT){
908         c->hChromaSubSample= cpuCaps&0x3;
909         c->vChromaSubSample= (cpuCaps>>4)&0x3;
910     }else{
911         c->hChromaSubSample= 1;
912         c->vChromaSubSample= 1;
913     }
914     if (cpuCaps & PP_CPU_CAPS_AUTO) {
915         c->cpuCaps = av_get_cpu_flags();
916     } else {
917         c->cpuCaps = 0;
918         if (cpuCaps & PP_CPU_CAPS_MMX)      c->cpuCaps |= AV_CPU_FLAG_MMX;
919         if (cpuCaps & PP_CPU_CAPS_MMX2)     c->cpuCaps |= AV_CPU_FLAG_MMXEXT;
920         if (cpuCaps & PP_CPU_CAPS_3DNOW)    c->cpuCaps |= AV_CPU_FLAG_3DNOW;
921         if (cpuCaps & PP_CPU_CAPS_ALTIVEC)  c->cpuCaps |= AV_CPU_FLAG_ALTIVEC;
922     }
923 
924     reallocBuffers(c, width, height, stride, qpStride);
925 
926     c->frameNum=-1;
927 
928     return c;
929 }
930 
pp_free_context(void * vc)931 void pp_free_context(void *vc){
932     PPContext *c = (PPContext*)vc;
933     int i;
934 
935     for(i=0; i<3; i++) av_free(c->tempBlurred[i]);
936     for(i=0; i<3; i++) av_free(c->tempBlurredPast[i]);
937 
938     av_free(c->tempBlocks.t_uint8_t);
939     av_free(c->yHistogram.t_uint64_t);
940     av_free(c->tempDst.t_uint8_t);
941     av_free(c->tempSrc.t_uint8_t);
942     av_free(c->deintTemp.t_uint8_t);
943     av_free(c->stdQPTable.t_QPST);
944     av_free(c->nonBQPTable.t_QPST);
945     av_free(c->forcedQPTable.t_QPST);
946 
947     memset(c, 0, sizeof(PPContext));
948 
949     av_free(c);
950 }
951 
/**
 * Postprocess one picture.
 *
 * Plane 0 is always run through postProcess(). Planes 1 and 2 are run
 * through it only when mode->chromMode is non-zero; otherwise they are
 * copied from src to dst unchanged.
 *
 * @param src        the three source planes
 * @param srcStride  line sizes of the source planes
 * @param dst        the three destination planes
 * @param dstStride  line sizes of the destination planes
 * @param width      width of plane 0; planes 1 and 2 use
 *                   width >> hChromaSubSample
 * @param height     height of plane 0; planes 1 and 2 use
 *                   height >> vChromaSubSample
 * @param QP_store   quantizer table with one entry per macroblock, or NULL.
 *                   When it is NULL, or FORCE_QUANT is set in mode->lumMode,
 *                   a single-row table from the context is used instead
 *                   (filled with mode->forcedQuant or with 1).
 * @param QPStride   entries per row of QP_store; may be negative
 * @param vm         mode (PPMode) to apply
 * @param vc         context (PPContext); its buffers are reallocated here
 *                   when the strides exceed what it was sized for
 * @param pict_type  flag word: PP_PICT_TYPE_QP2 requests halving of the
 *                   quantizer values; when (pict_type & 7) != 3 the
 *                   quantizers are also saved into the context's nonBQPTable
 *                   (presumably 3 denotes B pictures, going by the table
 *                   name -- confirm against the picture type definitions)
 */
void  pp_postprocess(const uint8_t * src[3], const int srcStride[3],
                     uint8_t * dst[3], const int dstStride[3],
                     int width, int height,
                     const QP_STORE_T *QP_store,  int QPStride,
                     pp_mode *vm,  void *vc, int pict_type)
{
    int mbWidth = (width+15)>>4;
    int mbHeight= (height+15)>>4;
    PPMode *mode = (PPMode*)vm;
    PPContext *c = (PPContext*)vc;
    int minStride= FFMAX(FFABS(srcStride[0]), FFABS(dstStride[0]));
    int absQPStride = FFABS(QPStride);

    // c->stride and c->qpStride are always positive
    if(c->stride < minStride || c->qpStride < absQPStride)
        reallocBuffers(c, width, height,
                       FFMAX(minStride, c->stride),
                       FFMAX(c->qpStride, absQPStride));

    if(!QP_store || (mode->lumMode & FORCE_QUANT)){
        int i;
        /* single shared row: stride 0 makes every macroblock row reuse it */
        QP_store= c->forcedQPTable.t_QPST;
        absQPStride = QPStride = 0;
        if(mode->lumMode & FORCE_QUANT)
            for(i=0; i<mbWidth; i++) c->forcedQPTable.t_QPST[i]= mode->forcedQuant;
        else
            for(i=0; i<mbWidth; i++) c->forcedQPTable.t_QPST[i]= 1;
    }

    if(pict_type & PP_PICT_TYPE_QP2){
        int i;
        /* With a stride of 0 (forced table) mbHeight*stride is 0 and nothing
         * would be converted, although QP_store is redirected to stdQPTable
         * below. Always convert at least one row of mbWidth entries. */
        const int count= FFMAX(mbHeight * absQPStride, mbWidth);
        /* Four entries at a time. The bytes are moved with memcpy() because
         * neither table is guaranteed to be aligned for, or accessible as,
         * uint32_t. Like the original word-wise loop this treats the tables
         * as arrays of single bytes. */
        for(i=0; i<(count>>2); i++){
            uint32_t packed;
            memcpy(&packed, (const uint8_t*)QP_store + 4*i, sizeof(packed));
            packed= (packed>>1) & 0x7F7F7F7F;
            memcpy((uint8_t*)c->stdQPTable.t_QPST + 4*i, &packed, sizeof(packed));
        }
        for(i<<=2; i<count; i++){
            c->stdQPTable.t_QPST[i] = QP_store[i]>>1;
        }
        QP_store= c->stdQPTable.t_QPST;
        QPStride= absQPStride;
    }

    /* disabled debugging aid: dumps the quantizer table */
    if(0){
        int x,y;
        for(y=0; y<mbHeight; y++){
            for(x=0; x<mbWidth; x++){
                av_log(c, AV_LOG_INFO, "%2d ", QP_store[x + y*QPStride]);
            }
            av_log(c, AV_LOG_INFO, "\n");
        }
        av_log(c, AV_LOG_INFO, "\n");
    }

    if((pict_type&7)!=3){
        if (QPStride >= 0){
            int i;
            /* same reasoning as above: a stride of 0 must still save one row */
            const int count= FFMAX(mbHeight * QPStride, mbWidth);
            for(i=0; i<(count>>2); i++){
                uint32_t packed;
                memcpy(&packed, (const uint8_t*)QP_store + 4*i, sizeof(packed));
                packed&= 0x3F3F3F3F;
                memcpy((uint8_t*)c->nonBQPTable.t_QPST + 4*i, &packed, sizeof(packed));
            }
            for(i<<=2; i<count; i++){
                c->nonBQPTable.t_QPST[i] = QP_store[i] & 0x3F;
            }
        } else {
            /* negative stride: read rows through QPStride, store them with
             * the positive stride */
            int i,j;
            for(i=0; i<mbHeight; i++) {
                for(j=0; j<absQPStride; j++) {
                    c->nonBQPTable.t_QPST[i*absQPStride+j] = QP_store[i*QPStride+j] & 0x3F;
                }
            }
        }
    }

    av_log(c, AV_LOG_DEBUG, "using npp filters 0x%X/0x%X\n",
           mode->lumMode, mode->chromMode);

    postProcess(src[0], srcStride[0], dst[0], dstStride[0],
                width, height, QP_store, QPStride, 0, mode, c);

    width  = (width )>>c->hChromaSubSample;
    height = (height)>>c->vChromaSubSample;

    if(mode->chromMode){
        postProcess(src[1], srcStride[1], dst[1], dstStride[1],
                    width, height, QP_store, QPStride, 1, mode, c);
        postProcess(src[2], srcStride[2], dst[2], dstStride[2],
                    width, height, QP_store, QPStride, 2, mode, c);
    }
    else if(srcStride[1] == dstStride[1] && srcStride[2] == dstStride[2]){
        /* identical line sizes: hand each whole plane to linecpy() */
        linecpy(dst[1], src[1], height, srcStride[1]);
        linecpy(dst[2], src[2], height, srcStride[2]);
    }else{
        /* differing line sizes: copy the planes line by line */
        int y;
        for(y=0; y<height; y++){
            memcpy(&(dst[1][y*dstStride[1]]), &(src[1][y*srcStride[1]]), width);
            memcpy(&(dst[2][y*dstStride[2]]), &(src[2][y*srcStride[2]]), width);
        }
    }
}
1051