1 /***********************************************************
2  * YUVDENOISER for the mjpegtools                          *
3  * ------------------------------------------------------- *
4  * (C) 2001-2004 Stefan Fendt                              *
5  *                                                         *
6  * Licensed and protected by the GNU-General-Public-       *
7  * License version 2 or if you prefer any later version of *
8  * that license). See the file LICENSE for detailed infor- *
9  * mation.                                                 *
10  *                                                         *
11  * FILE: main.c                                            *
12  *                                                         *
13  ***********************************************************/
14 
15 /* 2010-09-22, Franz Brauße <dev@karlchenofhell.org>
16  *  - added SSE2-accelerated versions of filter_plane_median()
17  *    and temporal_filter_planes()
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <math.h>
24 #include "config.h"
25 #include "mjpeg_types.h"
26 #include "yuv4mpeg.h"
27 #include "mjpeg_logging.h"
28 #include "cpu_accel.h"
29 #include "motionsearch.h"
30 
31 #if defined(__SSE3__)
32 # include <pmmintrin.h>
33 #elif defined(__SSE2__)
34 # include <emmintrin.h>
35 #endif
36 
37 int verbose = 1;
38 int width = 0;
39 int height = 0;
40 int lwidth = 0;
41 int lheight = 0;
42 int cwidth = 0;
43 int cheight = 0;
44 int input_chroma_subsampling = 0;
45 int input_interlaced = 0;
46 int hq_mode = 0;
47 
48 int renoise_Y=0;
49 int renoise_U=0;
50 int renoise_V=0;
51 
52 int temp_Y_thres = 0;
53 int temp_U_thres = 0;
54 int temp_V_thres = 0;
55 
56 int med_pre_Y_thres = 0;
57 int med_pre_U_thres = 0;
58 int med_pre_V_thres = 0;
59 
60 int med_post_Y_thres = 0;
61 int med_post_U_thres = 0;
62 int med_post_V_thres = 0;
63 
64 int gauss_Y = 0;
65 int gauss_U = 0;
66 int gauss_V = 0;
67 
68 uint8_t *frame1[3];
69 uint8_t *frame2[3];
70 uint8_t *frame3[3];
71 uint8_t *frame4[3];
72 uint8_t *frame5[3];
73 uint8_t *frame6[3];
74 uint8_t *frame7[3];
75 
76 uint8_t *scratchplane1;
77 uint8_t *scratchplane2;
78 uint8_t *outframe[3];
79 
80 int buff_offset;
81 int buff_size;
82 
83 uint16_t transform_L16[256];
84 uint8_t transform_G8[65536];
85 
86 /***********************************************************
87  * helper-functions                                        *
88  ***********************************************************/
89 
90 static void (*filter_plane_median)(uint8_t *, int, int, int);
91 static void (*temporal_filter_planes)(int, int, int, int);
92 
93 
94 void
gauss_filter_plane(uint8_t * frame,int w,int h,int t)95 gauss_filter_plane (uint8_t * frame, int w, int h, int t)
96 {
97 int i;
98 int v;
99 uint8_t * src = frame;
100 uint8_t * dst = scratchplane1;
101 
102 if(t==0) return;
103 
104 memcpy ( src-w*2, src, w );
105 memcpy ( src-w  , src, w );
106 
107 memcpy ( src+(w*h)  , src+(w*h)-w, w );
108 memcpy ( src+(w*h)+w, src+(w*h)-w, w );
109 
110 for(i=0;i<(w*h);i++)
111 	{
112 
113 	v  = *(src    -2)*1;
114 
115 	v += *(src-w  -1)*1;
116 	v += *(src    -1)*2;
117 	v += *(src+w  -1)*1;
118 
119 	v += *(src-w*2  )*1;
120 	v += *(src-w    )*2;
121 	v += *(src      )*4;
122 	v += *(src+w    )*2;
123 	v += *(src+w*2  )*1;
124 
125 	v += *(src-w  +1)*1;
126 	v += *(src    +1)*2;
127 	v += *(src+w  +1)*1;
128 
129 	v += *(src    +2)*1;
130 
131 	v /= 20;
132 
133 	v = *(src)*(256-t) + (v)*(t);
134 	v /= 256;
135 
136 	*(dst)=v;
137 
138 	dst++;
139 	src++;
140 	}
141 
142 memcpy ( frame,scratchplane1,w*h );
143 }
144 
145 void
temporal_filter_planes_MC(int idx,int w,int h,int t)146 temporal_filter_planes_MC (int idx, int w, int h, int t)
147 {
148   uint32_t sad,min;
149   uint32_t r, c, m;
150   int32_t d;
151   int x,y,sx,sy;
152 
153   uint32_t v;
154   int x1,y1;
155   int x2,y2;
156   int x3,y3;
157   int x5,y5;
158   int x6,y6;
159   int x7,y7;
160 
161   uint8_t *f1 = frame1[idx];
162   uint8_t *f2 = frame2[idx];
163   uint8_t *f3 = frame3[idx];
164   uint8_t *f4 = frame4[idx];
165   uint8_t *f5 = frame5[idx];
166   uint8_t *f6 = frame6[idx];
167   uint8_t *f7 = frame7[idx];
168   uint8_t *of = outframe[idx];
169 
170 #if 1
171 
172   if (t == 0)			// shortcircuit filter if t = 0...
173     {
174       memcpy (of, f4, w * h);
175       return;
176     }
177 #endif
178 
179       for (y = 0; y < h; y+=16)
180       for (x = 0; x < w; x+=16)
181 	{
182 
183 	// find best matching 16x16 block for f3
184 	min=psad_00 ( f4+(x)+(y)*w,f3+(x)+(y)*w,w,16,0x00ffffff );
185 	x3=y3=0;
186 	for (sy=-4; sy < 4; sy++)
187 	for (sx=-4; sx < 4; sx++)
188 	{
189 		sad  = psad_00 ( f4+(x)+(y)*w,f3+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
190 		sad += psad_00 ( f4+(x+8)+(y)*w,f3+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
191 		if(sad<min)
192 		{
193 		x3 = sx;
194 		y3 = sy;
195 		min = sad;
196 		}
197 	}
198 
199 	// find best matching 16x16 block for f5
200 	min=psad_00 ( f4+(x)+(y)*w,f5+(x)+(y)*w,w,16,0x00ffffff );
201 	x5=y5=0;
202 	for (sy=-4; sy < 4; sy++)
203 	for (sx=-4; sx < 4; sx++)
204 	{
205 		sad  = psad_00 ( f4+(x)+(y)*w,f5+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
206 		sad += psad_00 ( f4+(x+8)+(y)*w,f5+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
207 		if(sad<min)
208 		{
209 		x5 = sx;
210 		y5 = sy;
211 		min = sad;
212 		}
213 	}
214 
215 	// find best matching 16x16 block for f2
216 	min=psad_00 ( f4+(x)+(y)*w,f2+(x)+(y)*w,w,16,0x00ffffff );
217 	x2=y2=0;
218 	for (sy=(y3-4); sy < (y3+4); sy++)
219 	for (sx=(x3-4); sx < (x3+4); sx++)
220 	{
221 		sad  = psad_00 ( f4+(x)+(y)*w,f2+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
222 		sad += psad_00 ( f4+(x+8)+(y)*w,f2+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
223 		if(sad<min)
224 		{
225 		x2 = sx;
226 		y2 = sy;
227 		min = sad;
228 		}
229 	}
230 
231 	// find best matching 16x16 block for f6
232 	min=psad_00 ( f4+(x)+(y)*w,f6+(x)+(y)*w,w,16,0x00ffffff );
233 	x6=y6=0;
234 	for (sy=(y5-4); sy < (y5+4); sy++)
235 	for (sx=(x5-4); sx < (x5+4); sx++)
236 	{
237 		sad  = psad_00 ( f4+(x)+(y)*w,f6+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
238 		sad += psad_00 ( f4+(x+8)+(y)*w,f6+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
239 		if(sad<min)
240 		{
241 		x6 = sx;
242 		y6 = sy;
243 		min = sad;
244 		}
245 	}
246 
247 	// find best matching 16x16 block for f2
248 	min=psad_00 ( f4+(x)+(y)*w,f1+(x)+(y)*w,w,16,0x00ffffff );
249 	x1=y1=0;
250 	for (sy=(y2-4); sy < (y2+4); sy++)
251 	for (sx=(x2-4); sx < (x2+4); sx++)
252 	{
253 		sad  = psad_00 ( f4+(x)+(y)*w,f1+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
254 		sad += psad_00 ( f4+(x+8)+(y)*w,f1+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
255 		if(sad<min)
256 		{
257 		x1 = sx;
258 		y1 = sy;
259 		min = sad;
260 		}
261 	}
262 
263 	// find best matching 16x16 block for f7
264 	min=psad_00 ( f4+(x)+(y)*w,f7+(x)+(y)*w,w,16,0x00ffffff );
265 	x7=y7=0;
266 	for (sy=(y6-4); sy < (y6+4); sy++)
267 	for (sx=(x6-4); sx < (x6+4); sx++)
268 	{
269 		sad  = psad_00 ( f4+(x)+(y)*w,f7+(x+sx)+(y+sy)*w,w,16,0x00ffffff );
270 		sad += psad_00 ( f4+(x+8)+(y)*w,f7+(x+sx+8)+(y+sy)*w,w,16,0x00ffffff );
271 		if(sad<min)
272 		{
273 		x7 = sx;
274 		y7 = sy;
275 		min = sad;
276 		}
277 	}
278 
279 	for (sy=0; sy < 16; sy++)
280 	for (sx=0; sx < 16; sx++)
281 	{
282 		// gauss-filtered reference pixel
283 		r  = *(f4+(x+sx-1)+(y+sy-1)*w);
284 		r += *(f4+(x+sx  )+(y+sy-1)*w)*2;
285 		r += *(f4+(x+sx+1)+(y+sy-1)*w);
286 		r += *(f4+(x+sx-1)+(y+sy  )*w)*2;
287 		r += *(f4+(x+sx  )+(y+sy  )*w)*4;
288 		r += *(f4+(x+sx+1)+(y+sy  )*w)*2;
289 		r += *(f4+(x+sx-1)+(y+sy+1)*w);
290 		r += *(f4+(x+sx  )+(y+sy+1)*w)*2;
291 		r += *(f4+(x+sx+1)+(y+sy+1)*w);
292 		r /= 16;
293 
294 		// add non-filtered reference to the accummulator
295 		m = *(f4+(x+sx  )+(y+sy  )*w)*t;
296 		c = t;
297 
298 		// gauss-filtered and translated test pixel
299 		sx += x1;
300 		sy += y1;
301 		v  = *(f1+(x+sx-1)+(y+sy-1)*w);
302 		v += *(f1+(x+sx  )+(y+sy-1)*w)*2;
303 		v += *(f1+(x+sx+1)+(y+sy-1)*w);
304 		v += *(f1+(x+sx-1)+(y+sy  )*w)*2;
305 		v += *(f1+(x+sx  )+(y+sy  )*w)*4;
306 		v += *(f1+(x+sx+1)+(y+sy  )*w)*2;
307 		v += *(f1+(x+sx-1)+(y+sy+1)*w);
308 		v += *(f1+(x+sx  )+(y+sy+1)*w)*2;
309 		v += *(f1+(x+sx+1)+(y+sy+1)*w);
310 		sx -= x1;
311 		sy -= y1;
312 		v /= 16;
313 
314 		// add weighted and translated but non-filtered test-pixel
315 		d = t - abs (r-v);
316 		d = d<0? 0:d;
317 	  	c += d;
318           	m += *(f1+(x+sx+x1)+(y+sy+y1)*w)*d;
319 
320 		// gauss-filtered and translated test pixel
321 		sx += x2;
322 		sy += y2;
323 		v  = *(f2+(x+sx-1)+(y+sy-1)*w);
324 		v += *(f2+(x+sx  )+(y+sy-1)*w)*2;
325 		v += *(f2+(x+sx+1)+(y+sy-1)*w);
326 		v += *(f2+(x+sx-1)+(y+sy  )*w)*2;
327 		v += *(f2+(x+sx  )+(y+sy  )*w)*4;
328 		v += *(f2+(x+sx+1)+(y+sy  )*w)*2;
329 		v += *(f2+(x+sx-1)+(y+sy+1)*w);
330 		v += *(f2+(x+sx  )+(y+sy+1)*w)*2;
331 		v += *(f2+(x+sx+1)+(y+sy+1)*w);
332 		sx -= x2;
333 		sy -= y2;
334 		v /= 16;
335 
336 		// add weighted and translated but non-filtered test-pixel
337 		d = t - abs (r-v);
338 		d = d<0? 0:d;
339 	  	c += d;
340           	m += *(f2+(x+sx+x2)+(y+sy+y2)*w)*d;
341 
342 		// gauss-filtered and translated test pixel
343 		sx += x3;
344 		sy += y3;
345 		v  = *(f3+(x+sx-1)+(y+sy-1)*w);
346 		v += *(f3+(x+sx  )+(y+sy-1)*w)*2;
347 		v += *(f3+(x+sx+1)+(y+sy-1)*w);
348 		v += *(f3+(x+sx-1)+(y+sy  )*w)*2;
349 		v += *(f3+(x+sx  )+(y+sy  )*w)*4;
350 		v += *(f3+(x+sx+1)+(y+sy  )*w)*2;
351 		v += *(f3+(x+sx-1)+(y+sy+1)*w);
352 		v += *(f3+(x+sx  )+(y+sy+1)*w)*2;
353 		v += *(f3+(x+sx+1)+(y+sy+1)*w);
354 		sx -= x3;
355 		sy -= y3;
356 		v /= 16;
357 
358 		// add weighted and translated but non-filtered test-pixel
359 		d = t - abs (r-v);
360 		d = d<0? 0:d;
361 	  	c += d;
362           	m += *(f3+(x+sx+x3)+(y+sy+y3)*w)*d;
363 
364 		// gauss-filtered and translated test pixel
365 		sx += x5;
366 		sy += y5;
367 		v  = *(f5+(x+sx-1)+(y+sy-1)*w);
368 		v += *(f5+(x+sx  )+(y+sy-1)*w)*2;
369 		v += *(f5+(x+sx+1)+(y+sy-1)*w);
370 		v += *(f5+(x+sx-1)+(y+sy  )*w)*2;
371 		v += *(f5+(x+sx  )+(y+sy  )*w)*4;
372 		v += *(f5+(x+sx+1)+(y+sy  )*w)*2;
373 		v += *(f5+(x+sx-1)+(y+sy+1)*w);
374 		v += *(f5+(x+sx  )+(y+sy+1)*w)*2;
375 		v += *(f5+(x+sx+1)+(y+sy+1)*w);
376 		sx -= x5;
377 		sy -= y5;
378 		v /= 16;
379 
380 		// add weighted and translated but non-filtered test-pixel
381 		d = t - abs (r-v);
382 		d = d<0? 0:d;
383 	  	c += d;
384           	m += *(f5+(x+sx+x5)+(y+sy+y5)*w)*d;
385 
386 		// gauss-filtered and translated test pixel
387 		sx += x6;
388 		sy += y6;
389 		v  = *(f6+(x+sx-1)+(y+sy-1)*w);
390 		v += *(f6+(x+sx  )+(y+sy-1)*w)*2;
391 		v += *(f6+(x+sx+1)+(y+sy-1)*w);
392 		v += *(f6+(x+sx-1)+(y+sy  )*w)*2;
393 		v += *(f6+(x+sx  )+(y+sy  )*w)*4;
394 		v += *(f6+(x+sx+1)+(y+sy  )*w)*2;
395 		v += *(f6+(x+sx-1)+(y+sy+1)*w);
396 		v += *(f6+(x+sx  )+(y+sy+1)*w)*2;
397 		v += *(f6+(x+sx+1)+(y+sy+1)*w);
398 		sx -= x6;
399 		sy -= y6;
400 		v /= 16;
401 
402 		// add weighted and translated but non-filtered test-pixel
403 		d = t - abs (r-v);
404 		d = d<0? 0:d;
405 	  	c += d;
406           	m += *(f6+(x+sx+x6)+(y+sy+y6)*w)*d;
407 
408 		// gauss-filtered and translated test pixel
409 		sx += x7;
410 		sy += y7;
411 		v  = *(f7+(x+sx-1)+(y+sy-1)*w);
412 		v += *(f7+(x+sx  )+(y+sy-1)*w)*2;
413 		v += *(f7+(x+sx+1)+(y+sy-1)*w);
414 		v += *(f7+(x+sx-1)+(y+sy  )*w)*2;
415 		v += *(f7+(x+sx  )+(y+sy  )*w)*4;
416 		v += *(f7+(x+sx+1)+(y+sy  )*w)*2;
417 		v += *(f7+(x+sx-1)+(y+sy+1)*w);
418 		v += *(f7+(x+sx  )+(y+sy+1)*w)*2;
419 		v += *(f7+(x+sx+1)+(y+sy+1)*w);
420 		sx -= x7;
421 		sy -= y7;
422 		v /= 16;
423 
424 		// add weighted and translated but non-filtered test-pixel
425 		d = t - abs (r-v);
426 		d = d<0? 0:d;
427 	  	c += d;
428           	m += *(f7+(x+sx+x7)+(y+sy+y7)*w)*d;
429 
430 		*(of+(x+sx)+(y+sy)*w) = m/c;
431 
432 	}
433 	}
434 }
435 
renoise(uint8_t * frame,int w,int h,int level)436 void renoise (uint8_t * frame, int w, int h, int level )
437 {
438 uint8_t random[8192];
439 int i;
440 static int cnt=0;
441 
442 for(i=0;i<8192;i++)
443 	random[(i+cnt/2)&8191]=(i+i+i+i*i*i+1-random[i-1]+random[i-20]*random[i-5]*random[i-25])&255;
444 cnt++;
445 
446 for(i=0;i<(w*h);i++)
447 	*(frame+i)=(*(frame+i)*(255-level)+random[i&8191]*level)/255;
448 }
449 
450 #if defined(__SSE2__)
451 
tf0(const __m128i mask,const __m128i l0,const __m128i vt,const __m128i vc,const __m128i vb)452 static inline __m128i tf0(const __m128i mask, const __m128i l0, const __m128i vt, const __m128i vc, const __m128i vb) {
453 	__m128i k0, k1, k2, k3, d0; /* temp storage, pixel surroundings, 16-bit words */
454 
455 	/* even pixels */
456 	/* extract and add the pixels above and below the current one */
457 	k0 = _mm_and_si128(_mm_srli_si128(vt, 1), mask);
458 	k1 = _mm_and_si128(_mm_srli_si128(vb, 1), mask);
459 	k0 = _mm_add_epi16(k0, k1);
460 
461 	/* add together the 4 corner pixels diagonal of the current one */
462 	k2 = _mm_add_epi16(_mm_and_si128(vt, mask), _mm_and_si128(vb, mask));
463 	k2 = _mm_add_epi16(k2, _mm_srli_si128(k2, 2));
464 
465 	/* add pixels left and right of the current */
466 	k3 = _mm_and_si128(vc, mask);
467 	k3 = _mm_add_epi16(k3, _mm_srli_si128(k3, 2));
468 
469 	/* add weighted current pixel and the above results */
470 	k1 = _mm_and_si128(_mm_srli_si128(vc, 1), mask);
471 	d0 = _mm_slli_epi16(k1, 1); /* center * 4 */
472 	d0 = _mm_add_epi16(d0, k0);
473 	d0 = _mm_add_epi16(d0, k3);
474 	d0 = _mm_slli_epi16(d0, 1); /* + above,below,left,right * 2 */
475 	d0 = _mm_add_epi16(d0, k2); /* + diagonal * 1 */
476 	d0 = _mm_srli_epi16(d0, 4); /* all / 16 */
477 	return d0;
478 }
479 
tf1(const __m128i mask,const __m128i l0,const __m128i vt,const __m128i vc,const __m128i vb)480 static inline __m128i tf1(const __m128i mask, const __m128i l0, const __m128i vt, const __m128i vc, const __m128i vb) {
481 	__m128i k0, k1, k2, k3, d1;
482 
483 	k0 = _mm_srli_si128(_mm_add_epi16(_mm_and_si128(vt, mask), _mm_and_si128(vb, mask)), 2);
484 
485 	k1 = _mm_and_si128(_mm_srli_si128(vt, 1), mask);
486 	k2 = _mm_and_si128(_mm_srli_si128(vb, 1), mask);
487 	k2 = _mm_add_epi16(k1, k2);
488 	k2 = _mm_add_epi16(k2, _mm_srli_si128(k2, 2));
489 
490 	k3 = _mm_and_si128(_mm_srli_si128(vc, 1), mask);
491 	k3 = _mm_add_epi16(k3, _mm_srli_si128(k3, 2));
492 
493 	k1 = _mm_and_si128(_mm_srli_si128(vc, 2), mask);
494 	d1 = _mm_slli_epi16(k1, 1);
495 	d1 = _mm_add_epi16(d1, k0);
496 	d1 = _mm_add_epi16(d1, k3);
497 	d1 = _mm_slli_epi16(d1, 1);
498 	d1 = _mm_add_epi16(d1, k2);
499 	d1 = _mm_srli_epi16(d1, 4);
500 	return d1;
501 }
502 
503 /* 8 times as fast on x86_64, 2.2 times as fast on i686 */
temporal_filter_planes_sse2(int idx,int w,int h,int t)504 void temporal_filter_planes_sse2(int idx, int w, int h, int t)
505 {
506 	int x, k;
507 
508 	uint8_t *f4 = frame4[idx];
509 	uint8_t *of = outframe[idx];
510 
511 	uint8_t *f[6] = {
512 		frame3[idx], frame2[idx], frame1[idx], frame5[idx], frame6[idx], frame7[idx]
513 	};
514 
515 	if (t == 0)			// shortcircuit filter if t = 0...
516 	{
517 		memcpy (of, f4, w * h);
518 		return;
519 	}
520 
521 	/* vt: x x x x x x x x x x x x x x x x
522 	 * vc: x 0 1 2 3 4 5 6 7 8 9 a b c d x
523 	 * vb: x x x x x x x x x x x x x x x x
524 	 *
525 	 * c0, d0, m0 and m1 store the respective values for even pixels,
526 	 * m0 for 0, 2, 4 and 6, m1 for 8, a and c in its lower dwords;
527 	 * c1, d1, m2 and m3 store the values for the odd pixels,
528 	 * m2 for 1, 3, 5 and 7, m3 for 9, b and d in its lower dwords;
529 	 * whereas computation for each pixel is as follows (equivalent to the
530 	 * non-SSE-accelerated variant of this function):
531 	 *
532 	 * for each pixel k1 of the 14 pixels per block:
533 	 * vt:  k2  k0  k2
534 	 * vc:  k3  k1  k3
535 	 * vb:  k2  k0  k2
536 	 *
537 	 * r = *f4++;
538 	 * c = t + 1;
539 	 * m = r * (t+1);
540 	 * for (i=0; i<6; i++) {
541 	 *   d = sum(k2) + 2 * (sum(k0) + sum(k3) + 2 * k1)
542 	 *   d = saturate(t - abs(r-d));
543 	 *   c += d;
544 	 *   m += *f[i]++ * d;
545 	 * }
546 	 * m *= 2;
547 	 * *of++ = (m / c + 1) / 2;
548 	 *
549 	 * For each frame, first the 7 even pixels are being processed, then the 7
550 	 * odd ones.
551 	 */
552 
553 	__m128i vt, vc, vb;      /* top-, center- and bottom-line of 3x16 block */
554 	__m128i c0, c1, m0, m1, m2, m3; /* c: 16-bit words, m: 32-bit dwords */
555 	__m128i d0, d1, r0, r1;  /* 16-bit words, 0: even, 1: odd */
556 	const __m128i mask = _mm_set1_epi16(0x00ff);
557 	const __m128i l0 = _mm_set1_epi16(t);
558 
559 #ifndef OLD_ROUNDING
560 	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
561 #endif
562 
563 	for (x = 0; x < (w * h); x+=14)
564 	{
565 		vt = _mm_loadu_si128((__m128i *)(f4 - 1 - w));
566 		vc = _mm_loadu_si128((__m128i *)(f4 - 1    ));
567 		vb = _mm_loadu_si128((__m128i *)(f4 - 1 + w));
568 		f4 += 14;
569 
570 		r0 = tf0(mask, l0, vt, vc, vb);  /* even pixels */
571 		r1 = tf1(mask, l0, vt, vc, vb);  /* odd pixels */
572 
573 		c0 = c1 = _mm_set1_epi16(t + 1);
574 
575 		/* m = *f4 * (t+1) */
576 		/* The low 16-bit of the multiplication suffice, because both operands are
577 		 * only 8-bit-values */
578 		d0 = _mm_mullo_epi16(_mm_and_si128(_mm_srli_si128(vc, 1), mask), c0);
579 		m0 = _mm_unpacklo_epi16(d0, _mm_setzero_si128());
580 		m1 = _mm_unpackhi_epi16(d0, _mm_setzero_si128());
581 		d1 = _mm_mullo_epi16(_mm_and_si128(_mm_srli_si128(vc, 2), mask), c0);
582 		m2 = _mm_unpacklo_epi16(d1, _mm_setzero_si128());
583 		m3 = _mm_unpackhi_epi16(d1, _mm_setzero_si128());
584 
585 		for (k=0; k<sizeof(f)/sizeof(*f); k++) {
586 			vt = _mm_loadu_si128((__m128i *)(f[k] - 1 - w));
587 			vc = _mm_loadu_si128((__m128i *)(f[k] - 1    ));
588 			vb = _mm_loadu_si128((__m128i *)(f[k] - 1 + w));
589 			f[k] += 14;
590 
591 			/* even pixels */
592 			d0 = tf0(mask, l0, vt, vc, vb);
593 			/* d = l - abs(r-d) */
594 			d0 = _mm_subs_epu16(l0, _mm_sub_epi16(_mm_max_epi16(r0, d0), _mm_min_epi16(r0, d0)));
595 			c0 = _mm_add_epi16(c0, d0);
596 			/* d *= *f[k] */
597 			d0 = _mm_mullo_epi16(_mm_and_si128(_mm_srli_si128(vc, 1), mask), d0);
598 			m0 = _mm_add_epi32(m0, _mm_unpacklo_epi16(d0, _mm_setzero_si128()));
599 			m1 = _mm_add_epi32(m1, _mm_unpackhi_epi16(d0, _mm_setzero_si128()));
600 
601 			/* odd pixels */
602 			d1 = tf1(mask, l0, vt, vc, vb);
603 			d1 = _mm_subs_epu16(l0, _mm_sub_epi16(_mm_max_epi16(r1, d1), _mm_min_epi16(r1, d1)));
604 			c1 = _mm_add_epi16(c1, d1);
605 			d1 = _mm_mullo_epi16(_mm_and_si128(_mm_srli_si128(vc, 2), mask), d1);
606 			m2 = _mm_add_epi32(m2, _mm_unpacklo_epi16(d1, _mm_setzero_si128()));
607 			m3 = _mm_add_epi32(m3, _mm_unpackhi_epi16(d1, _mm_setzero_si128()));
608 		}
609 
610 		/* extract results from c0, c1 and m0 to m3:
611 		 * 14 byte result = interleave_bytes((m0,m1) / c0, (m2,m3) / c1) */
612 #ifdef OLD_ROUNDING
613 		/* r = m*2/c */
614 		/* multiply each m with 2 */
615 		m0 = _mm_slli_epi32(m0, 1);
616 		m1 = _mm_slli_epi32(m1, 1);
617 		m2 = _mm_slli_epi32(m2, 1);
618 		m3 = _mm_slli_epi32(m3, 1);
619 #endif
620 
621 		/* m0-m3 each contain 4 19-bit values ((8-bit)^2 * 8), so a single precision
622 		 * float with 23-bit mantissa can hold these without precision loss */
623 		/* r = m/c */
624 		__m128i k0 = _mm_setzero_si128();
625 		__m128 f0, f1, f2, f3;
626 		f0 = _mm_div_ps(_mm_cvtepi32_ps(m0), _mm_cvtepi32_ps(_mm_unpacklo_epi16(c0, k0)));
627 		f1 = _mm_div_ps(_mm_cvtepi32_ps(m1), _mm_cvtepi32_ps(_mm_unpackhi_epi16(c0, k0)));
628 		f2 = _mm_div_ps(_mm_cvtepi32_ps(m2), _mm_cvtepi32_ps(_mm_unpacklo_epi16(c1, k0)));
629 		f3 = _mm_div_ps(_mm_cvtepi32_ps(m3), _mm_cvtepi32_ps(_mm_unpackhi_epi16(c1, k0)));
630 
631 #ifdef OLD_ROUNDING
632 		m0 = _mm_cvttps_epi32(f0);
633 		m1 = _mm_cvttps_epi32(f1);
634 		m2 = _mm_cvttps_epi32(f2);
635 		m3 = _mm_cvttps_epi32(f3);
636 #else
637 		m0 = _mm_cvtps_epi32(f0);
638 		m1 = _mm_cvtps_epi32(f1);
639 		m2 = _mm_cvtps_epi32(f2);
640 		m3 = _mm_cvtps_epi32(f3);
641 #endif
642 
643 		r0 = _mm_packs_epi32(m0, m1); /* 7 words f0,f1 */
644 		r1 = _mm_packs_epi32(m2, m3); /* 7 words f2,f3 */
645 
646 #ifdef OLD_ROUNDING
647 		/* (r+1)/2 */
648 		k0 = _mm_set1_epi16(1);
649 		r0 = _mm_srli_epi16(_mm_add_epi16(r0, k0), 1);
650 		r1 = _mm_srli_epi16(_mm_add_epi16(r1, k0), 1);
651 #endif
652 
653 		/* 7 words r0 interleaved with 7 words r1, all converted to bytes */
654 		r0 = _mm_packus_epi16(_mm_unpacklo_epi16(r0, r1), _mm_unpackhi_epi16(r0, r1));
655 		/* write 16, but the 2 bytes overlap will be overwritten by the next pass */
656 		_mm_storeu_si128((__m128i *)of, r0);
657 		of += 14;
658 	}
659 	_mm_empty();
660 }
661 #endif
662 
temporal_filter_planes_p(int idx,int w,int h,int t)663 void temporal_filter_planes_p (int idx, int w, int h, int t)
664 {
665 	uint32_t r, c, m;
666 	int32_t d;
667 	int x;
668 
669 	uint8_t *f1 = frame1[idx];
670 	uint8_t *f2 = frame2[idx];
671 	uint8_t *f3 = frame3[idx];
672 	uint8_t *f4 = frame4[idx];
673 	uint8_t *f5 = frame5[idx];
674 	uint8_t *f6 = frame6[idx];
675 	uint8_t *f7 = frame7[idx];
676 	uint8_t *of = outframe[idx];
677 
678 	if (t == 0)			// shortcircuit filter if t = 0...
679 	{
680 		memcpy (of, f4, w * h);
681 		return;
682 	}
683 
684 	for (x = 0; x < (w * h); x++)
685 	{
686 		r  = *(f4-1-w);
687 		r += *(f4  -w)*2;
688 		r += *(f4+1-w);
689 		r += *(f4-1  )*2;
690 		r += *(f4    )*4;
691 		r += *(f4+1  )*2;
692 		r += *(f4-1+w);
693 		r += *(f4  +w)*2;
694 		r += *(f4+1+w);
695 		r /= 16;
696 
697 		m = *(f4)*(t+1)*2;
698 		c = t+1;
699 
700 		d  = *(f3-1-w);
701 		d += *(f3  -w)*2;
702 		d += *(f3+1-w);
703 		d += *(f3-1  )*2;
704 		d += *(f3    )*4;
705 		d += *(f3+1  )*2;
706 		d += *(f3-1+w);
707 		d += *(f3  +w)*2;
708 		d += *(f3+1+w);
709 		d /= 16;
710 
711 		d = t - abs (r-d);
712 		d = d<0? 0:d;
713 		c += d;
714 		m += *(f3)*d*2;
715 
716 		d  = *(f2-1-w);
717 		d += *(f2  -w)*2;
718 		d += *(f2+1-w);
719 		d += *(f2-1  )*2;
720 		d += *(f2    )*4;
721 		d += *(f2+1  )*2;
722 		d += *(f2-1+w);
723 		d += *(f2  +w)*2;
724 		d += *(f2+1+w);
725 		d /= 16;
726 
727 		d = t - abs (r-d);
728 		d = d<0? 0:d;
729 		c += d;
730 		m += *(f2)*d*2;
731 
732 		d  = *(f1-1-w);
733 		d += *(f1  -w)*2;
734 		d += *(f1+1-w);
735 		d += *(f1-1  )*2;
736 		d += *(f1    )*4;
737 		d += *(f1+1  )*2;
738 		d += *(f1-1+w);
739 		d += *(f1  +w)*2;
740 		d += *(f1+1+w);
741 		d /= 16;
742 
743 		d = t - abs (r-d);
744 		d = d<0? 0:d;
745 		c += d;
746 		m += *(f1)*d*2;
747 
748 		d  = *(f5-1-w);
749 		d += *(f5  -w)*2;
750 		d += *(f5+1-w);
751 		d += *(f5-1  )*2;
752 		d += *(f5    )*4;
753 		d += *(f5+1  )*2;
754 		d += *(f5-1+w);
755 		d += *(f5  +w)*2;
756 		d += *(f5+1+w);
757 		d /= 16;
758 
759 		d = t - abs (r-d);
760 		d = d<0? 0:d;
761 		c += d;
762 		m += *(f5)*d*2;
763 
764 		d  = *(f6-1-w);
765 		d += *(f6  -w)*2;
766 		d += *(f6+1-w);
767 		d += *(f6-1  )*2;
768 		d += *(f6    )*4;
769 		d += *(f6+1  )*2;
770 		d += *(f6-1+w);
771 		d += *(f6  +w)*2;
772 		d += *(f6+1+w);
773 		d /= 16;
774 
775 		d = t - abs (r-d);
776 		d = d<0? 0:d;
777 		c += d;
778 		m += *(f6)*d*2;
779 
780 		d  = *(f7-1-w);
781 		d += *(f7  -w)*2;
782 		d += *(f7+1-w);
783 		d += *(f7-1  )*2;
784 		d += *(f7    )*4;
785 		d += *(f7+1  )*2;
786 		d += *(f7-1+w);
787 		d += *(f7  +w)*2;
788 		d += *(f7+1+w);
789 		d /= 16;
790 
791 		d = t - abs (r-d);
792 		d = d<0? 0:d;
793 		c += d;
794 		m += *(f7)*d*2;
795 
796 		*(of) = ((m/c)+1)/2;
797 
798 		f1++;
799 		f2++;
800 		f3++;
801 		f4++;
802 		f5++;
803 		f6++;
804 		f7++;
805 		of++;
806 	}
807 }
808 
809 #if defined(__SSE2__)
810 /* 4 to 5 times faster */
filter_plane_median_sse2(uint8_t * plane,int w,int h,int level)811 void filter_plane_median_sse2(uint8_t *plane, int w, int h, int level) {
812 	int i;
813 	int avg; /*should not be needed any more */
814 	int cnt; /* should not be needed any more */
815 	uint8_t * p;
816 	uint8_t * d;
817 
818 	if(level==0) return;
819 
820 	p = plane;
821 	d = scratchplane1;
822 
823 	// remove strong outliers from the image. An outlier is a pixel which lies outside
824 	// of max-thres and min+thres of the surrounding pixels. This should not cause blurring
825 	// and it should leave an evenly spread noise-floor to the image.
826 	for (i=0; i<=(w*h); i+=14) {
827 		__m128i t, c, b, min, max, minmin, maxmax;
828 
829 		t = _mm_loadu_si128((__m128i *)&p[i-1-w]);
830 		c = _mm_loadu_si128((__m128i *)&p[i-1  ]);
831 		b = _mm_loadu_si128((__m128i *)&p[i-1+w]);
832 		min = _mm_min_epu8(t, b);      /* k: (0,k), (2,k) */
833 		max = _mm_max_epu8(t, b);
834 		minmin = _mm_min_epu8(min, c); /* k: (0,k), (1,k), (2,k) */
835 		maxmax = _mm_max_epu8(max, c);
836 
837 		/* k: (0,k), (1,k), (2,k). (0,k+2), (1,k+2), (2,k+2) */
838 		minmin = _mm_min_epu8(minmin, _mm_srli_si128(minmin, 2));
839 		maxmax = _mm_max_epu8(maxmax, _mm_srli_si128(maxmax, 2));
840 		/* k: (0,k), (1,k), (2,k). (0,k+2), (1,k+2), (2,k+2), (0,k+1), (2,k+1) */
841 		min = _mm_min_epu8(minmin, _mm_srli_si128(min, 1));
842 		max = _mm_max_epu8(maxmax, _mm_srli_si128(max, 1));
843 
844 		/* limit c to range [min,max] */
845 		c = _mm_max_epu8(min, _mm_min_epu8(max, _mm_srli_si128(c, 1)));
846 		/* write 14 valid pixels, the 2 remaining bytes are overwritten subsequently
847 		 * or lie outside the frame area */
848 		_mm_storeu_si128((__m128i *)&d[i], c);
849 	}
850 
851 	// in the second stage we try to average similar spatial pixels, only. This, like
852 	// a median, should also not reduce sharpness but flatten the noisefloor. This
853 	// part is quite similar to what 2dclean/yuvmedianfilter do. But because of the
854 	// different weights given to the pixels it is less aggressive...
855 
856 	p = scratchplane1;
857 	d = scratchplane2;
858 
859 	// this filter needs values outside of the imageplane, so we just copy the first line
860 	// and the last line into the out-of-range area...
861 
862 	memcpy ( p-w  , p, w );
863 	memcpy ( p-w*2, p, w );
864 
865 	memcpy ( p+(w*h)  , p+(w*h)-w, w );
866 	memcpy ( p+(w*h)+w, p+(w*h)-w, w );
867 
868 	__m128i lvl = _mm_set1_epi16(level);
869 
870 #ifndef OLD_ROUNDING
871 	_MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
872 #endif
873 
874 	for (i=0; i<=(w*h); i+=4)
875 	{
876 		uint64_t k0, k1, k2, k3, k6;
877 		__m128i c0, c1, v[4], t[4], e[4], a[4];
878 
879 		/* p points to pixel a. There are 3 stages, each processing 8 surrounding
880 		 * pixels, resulting in the complete neighbourhood of 24 pixels.
881 		 *
882 		 *     0 1 2 3 4 5 6 7
883 		 * k0: x x x x x x x x
884 		 * k1: x x x x x x x x
885 		 * k6: x x a b c d x x
886 		 * k2: x x x x x x x x
887 		 * k3: x x x x x x x x
888 		 *
889 		 * a,b and c,d each share two 2x4-blocks in their surrounding area, which
890 		 * are processed by stages 1 and 2. These blocks are referred to as c0 and
891 		 * c1 by the code below. The remaining surrounding pixels are processed for
892 		 * each pixel out of a,b,c,d individually in stage 3.
893 		 * Stage 4 assembles the results, adds the weighted averages and weights
894 		 * together for each pixel, computes the median and stores it in the
895 		 * scratch plane.
896 		 * Coordinates (y,x) originate from the top left of the above diagram. */
897 
898 		k0 = *(uint64_t *)(p-w*2-2);
899 		k1 = *(uint64_t *)(p-w*1-2);
900 		k6 = *(uint64_t *)(p    -2);
901 		k2 = *(uint64_t *)(p+w*1-2);
902 		k3 = *(uint64_t *)(p+w*2-2);
903 
904 		v[0] = _mm_set1_epi16((k6 >> 16) & 0xff); /* pixel a */
905 		v[1] = _mm_set1_epi16((k6 >> 24) & 0xff); /* pixel b */
906 		v[2] = _mm_set1_epi16((k6 >> 32) & 0xff); /* pixel c */
907 		v[3] = _mm_set1_epi16((k6 >> 40) & 0xff); /* pixel d */
908 
909 		// stage 1: c0 for a,b: (0,1) -> (1,4), c1 for c,d: (0,3) -> (1,6)
910 		c0  = _mm_set_epi32((k0 >>  8) & 0xff00ff, (k0 >> 16) & 0xff00ff,
911 												(k1 >>  8) & 0xff00ff, (k1 >> 16) & 0xff00ff);
912 		c1  = _mm_set_epi32((k0 >> 24) & 0xff00ff, (k0 >> 32) & 0xff00ff,
913 												(k1 >> 24) & 0xff00ff, (k1 >> 32) & 0xff00ff);
914 		t[0] = _mm_sub_epi16(_mm_max_epu8(c0, v[0]), _mm_min_epu8(c0, v[0]));
915 		t[1] = _mm_sub_epi16(_mm_max_epu8(c0, v[1]), _mm_min_epu8(c0, v[1]));
916 		t[2] = _mm_sub_epi16(_mm_max_epu8(c1, v[2]), _mm_min_epu8(c1, v[2]));
917 		t[3] = _mm_sub_epi16(_mm_max_epu8(c1, v[3]), _mm_min_epu8(c1, v[3]));
918 		e[0] = _mm_subs_epu16(lvl, t[0]);
919 		e[1] = _mm_subs_epu16(lvl, t[1]);
920 		e[2] = _mm_subs_epu16(lvl, t[2]);
921 		e[3] = _mm_subs_epu16(lvl, t[3]);
922 		a[0] = _mm_madd_epi16(e[0], c0);
923 		a[1] = _mm_madd_epi16(e[1], c0);
924 		a[2] = _mm_madd_epi16(e[2], c1);
925 		a[3] = _mm_madd_epi16(e[3], c1);
926 
927 		// stage 2: c0 for a,b: (3,1) -> (4,4), c1 for c,d: (3,3) -> (4,6)
928 		c0  = _mm_set_epi32((k2 >>  8) & 0xff00ff, (k2 >> 16) & 0xff00ff,
929 												(k3 >>  8) & 0xff00ff, (k3 >> 16) & 0xff00ff);
930 		c1  = _mm_set_epi32((k2 >> 24) & 0xff00ff, (k2 >> 32) & 0xff00ff,
931 												(k3 >> 24) & 0xff00ff, (k3 >> 32) & 0xff00ff);
932 		t[0] = _mm_sub_epi16(_mm_max_epu8(c0, v[0]), _mm_min_epu8(c0, v[0]));
933 		t[1] = _mm_sub_epi16(_mm_max_epu8(c0, v[1]), _mm_min_epu8(c0, v[1]));
934 		t[2] = _mm_sub_epi16(_mm_max_epu8(c1, v[2]), _mm_min_epu8(c1, v[2]));
935 		t[3] = _mm_sub_epi16(_mm_max_epu8(c1, v[3]), _mm_min_epu8(c1, v[3]));
936 		t[0] = _mm_subs_epu16(lvl, t[0]);
937 		t[1] = _mm_subs_epu16(lvl, t[1]);
938 		t[2] = _mm_subs_epu16(lvl, t[2]);
939 		t[3] = _mm_subs_epu16(lvl, t[3]);
940 		e[0] = _mm_add_epi16(t[0], e[0]);
941 		e[1] = _mm_add_epi16(t[1], e[1]);
942 		e[2] = _mm_add_epi16(t[2], e[2]);
943 		e[3] = _mm_add_epi16(t[3], e[3]);
944 		a[0] = _mm_add_epi32(_mm_madd_epi16(t[0], c0), a[0]);
945 		a[1] = _mm_add_epi32(_mm_madd_epi16(t[1], c0), a[1]);
946 		a[2] = _mm_add_epi32(_mm_madd_epi16(t[2], c1), a[2]);
947 		a[3] = _mm_add_epi32(_mm_madd_epi16(t[3], c1), a[3]);
948 
949 		// stage 3:
950 		// pixel a: (0,0) -> (4,0), (2,1), (2,3), (2,4)
951 		c0 = _mm_set_epi32(((k0 & 0xff) << 16) | (k1 & 0xff),
952 		                   ((k2 & 0xff) << 16) | (k3 & 0xff),
953 		                   (k6 >> 8) & 0xff00ff,
954 		                   (k6 & 0xff) | ((k6 >> 16) & 0xff0000));
955 		t[0] = _mm_sub_epi16(_mm_max_epu8(c0, v[0]), _mm_min_epu8(c0, v[0]));
956 		t[0] = _mm_subs_epu16(lvl, t[0]);
957 		e[0] = _mm_add_epi16(t[0], e[0]);
958 		a[0] = _mm_add_epi32(_mm_madd_epi16(t[0], c0), a[0]);
959 
960 		// pixel c: (0,2) -> (4,2), (2,3), (2,5), (2,6)
961 		c1 = _mm_set_epi32((k0 & 0xff0000) | ((k1 >> 16) & 0xff),
962 		                   (k2 & 0xff0000) | ((k3 >> 16) & 0xff),
963 		                   (k6 >> 24) & 0xff00ff,
964 		                   ((k6 >> 16) & 0xff) | ((k6 >> 32) & 0xff0000));
965 		t[2] = _mm_sub_epi16(_mm_max_epu8(c1, v[2]), _mm_min_epu8(c1, v[2]));
966 		t[2] = _mm_subs_epu16(lvl, t[2]);
967 		e[2] = _mm_add_epi16(t[2], e[2]);
968 		a[2] = _mm_add_epi32(_mm_madd_epi16(t[2], c1), a[2]);
969 
970 		// pixel b: (0,5) -> (4,5), (2,1), (2,2), (2,4), (2,5)
971 		c0 = _mm_set_epi32(((k0 >> 24) & 0xff0000) | ((k1 >> 40) & 0xff),
972 		                   ((k2 >> 24) & 0xff0000) | ((k3 >> 40) & 0xff),
973 		                   (k6 >> 16) & 0xff00ff,
974 		                   ((k6 >> 8) & 0xff) | ((k6 >> 24) & 0xff0000));
975 		t[1] = _mm_sub_epi16(_mm_max_epu8(c0, v[1]), _mm_min_epu8(c0, v[1]));
976 		t[1] = _mm_subs_epu16(lvl, t[1]);
977 		e[1] = _mm_add_epi16(t[1], e[1]);
978 		a[1] = _mm_add_epi32(_mm_madd_epi16(t[1], c0), a[1]);
979 
980 		// pixel d: (0,7) -> (4,7), (2,3), (2,4), (2,6), (2,7)
981 		c1 = _mm_set_epi32(((k0 >> 40) & 0xff0000) | (k1 >> 56),
982 		                   ((k2 >> 40) & 0xff0000) | (k3 >> 56),
983 		                   (k6 >> 32) & 0xff00ff,
984 		                   ((k6 >> 24) & 0xff) | ((k6 >> 40) & 0xff0000));
985 		t[3] = _mm_sub_epi16(_mm_max_epu8(c1, v[3]), _mm_min_epu8(c1, v[3]));
986 		t[3] = _mm_subs_epu16(lvl, t[3]);
987 		e[3] = _mm_add_epi16(t[3], e[3]);
988 		a[3] = _mm_add_epi32(_mm_madd_epi16(t[3], c1), a[3]);
989 
990 		/* add them all together (a loop j=0 to 4 slows things down with gcc 4.4.4) */
991 		uint32_t tmp;
992 
993 #if defined(__SSE3__)
994 		int j;
995 		__m128 f0, f1, f2, flvl;
996 		__m128i zero, vv;
997 		flvl = _mm_set1_ps((float)level);
998 		zero = _mm_setzero_si128();
999 		vv = _mm_set_epi32((k6 >> 40) & 0xff, (k6 >> 32) & 0xff, (k6 >> 24) & 0xff, (k6 >> 16) & 0xff);
1000 
1001 		f0 = _mm_hadd_ps(_mm_cvtepi32_ps(a[0]), _mm_cvtepi32_ps(a[1]));
1002 		f1 = _mm_hadd_ps(_mm_cvtepi32_ps(a[2]), _mm_cvtepi32_ps(a[3]));
1003 		f0 = _mm_hadd_ps(f0, f1);
1004 		f0 = _mm_add_ps(f0, _mm_mul_ps(flvl, _mm_cvtepi32_ps(vv)));
1005 
1006 # ifdef OLD_ROUNDING
1007 		/* avg *= 2 */
1008 		f0 = _mm_mul_ps(f0, _mm_set1_ps(2.0f));
1009 # endif
1010 
1011 		for (j=0; j<4; j++)
1012 			e[j] = _mm_unpacklo_epi16(_mm_add_epi16(e[j], _mm_srli_si128(e[j], 8)), zero);
1013 		f1 = _mm_hadd_ps(_mm_cvtepi32_ps(e[0]), _mm_cvtepi32_ps(e[1]));
1014 		f2 = _mm_hadd_ps(_mm_cvtepi32_ps(e[2]), _mm_cvtepi32_ps(e[3]));
1015 		f1 = _mm_hadd_ps(f1, f2);
1016 		f1 = _mm_add_ps(f1, flvl);
1017 
1018 		f0 = _mm_div_ps(f0, f1);
1019 # ifdef OLD_ROUNDING
1020 		/* r = (r+1) / 2 */
1021 		vv = _mm_cvttps_epi32(f0);
1022 		vv = _mm_srli_epi32(_mm_add_epi32(vv, _mm_set1_epi32(1)), 1);
1023 # else
1024 		vv = _mm_cvtps_epi32(f0);
1025 # endif
1026 		vv = _mm_packus_epi16(_mm_packs_epi32(vv, zero), zero);
1027 		tmp = _mm_cvtsi128_si32(vv);
1028 
1029 #else
1030 
1031 		// p[0]
1032 		e[0] = _mm_add_epi16(e[0], _mm_srli_si128(e[0], 8)); /* 8 words */
1033 		e[0] = _mm_add_epi16(e[0], _mm_srli_si128(e[0], 4));
1034 		e[0] = _mm_add_epi16(e[0], _mm_srli_si128(e[0], 2));
1035 		cnt = level + (_mm_cvtsi128_si32(e[0]) & 0xffff);
1036 		a[0] = _mm_add_epi32(a[0], _mm_srli_si128(a[0], 8)); /* 4 dwords */
1037 		a[0] = _mm_add_epi32(a[0], _mm_srli_si128(a[0], 4));
1038 		avg = p[0] * level * 2 + (_mm_cvtsi128_si32(a[0]) << 1);
1039 		tmp = ((avg/cnt) + 1) / 2;
1040 
1041 		// p[1]
1042 		e[1] = _mm_add_epi16(e[1], _mm_srli_si128(e[1], 8)); /* 8 words */
1043 		e[1] = _mm_add_epi16(e[1], _mm_srli_si128(e[1], 4));
1044 		e[1] = _mm_add_epi16(e[1], _mm_srli_si128(e[1], 2));
1045 		cnt = level + (_mm_cvtsi128_si32(e[1]) & 0xffff);
1046 		a[1] = _mm_add_epi32(a[1], _mm_srli_si128(a[1], 8)); /* 4 dwords */
1047 		a[1] = _mm_add_epi32(a[1], _mm_srli_si128(a[1], 4));
1048 		avg = p[1] * level * 2 + (_mm_cvtsi128_si32(a[1]) << 1);
1049 		tmp |= (((avg/cnt) + 1) / 2) << 8;
1050 
1051 		// p[2]
1052 		e[2] = _mm_add_epi16(e[2], _mm_srli_si128(e[2], 8)); /* 8 words */
1053 		e[2] = _mm_add_epi16(e[2], _mm_srli_si128(e[2], 4));
1054 		e[2] = _mm_add_epi16(e[2], _mm_srli_si128(e[2], 2));
1055 		cnt = level + (_mm_cvtsi128_si32(e[2]) & 0xffff);
1056 		a[2] = _mm_add_epi32(a[2], _mm_srli_si128(a[2], 8)); /* 4 dwords */
1057 		a[2] = _mm_add_epi32(a[2], _mm_srli_si128(a[2], 4));
1058 		avg = p[2] * level * 2 + (_mm_cvtsi128_si32(a[2]) << 1);
1059 		tmp |= (((avg/cnt) + 1) / 2) << 16;
1060 
1061 		// p[3]
1062 		e[3] = _mm_add_epi16(e[3], _mm_srli_si128(e[3], 8)); /* 8 words */
1063 		e[3] = _mm_add_epi16(e[3], _mm_srli_si128(e[3], 4));
1064 		e[3] = _mm_add_epi16(e[3], _mm_srli_si128(e[3], 2));
1065 		cnt = level + (_mm_cvtsi128_si32(e[3]) & 0xffff);
1066 		a[3] = _mm_add_epi32(a[3], _mm_srli_si128(a[3], 8)); /* 4 dwords */
1067 		a[3] = _mm_add_epi32(a[3], _mm_srli_si128(a[3], 4));
1068 		avg = p[3] * level * 2 + (_mm_cvtsi128_si32(a[3]) << 1);
1069 		tmp |= (((avg/cnt) + 1) / 2) << 24;
1070 #endif
1071 
1072 		*(uint32_t *)d = tmp;
1073 
1074 		d += 4;
1075 		p += 4;
1076 	}
1077 	_mm_empty();
1078 
1079 	memcpy(plane,scratchplane2,w*h);
1080 }
1081 #endif
1082 
filter_plane_median_p(uint8_t * plane,int w,int h,int level)1083 void filter_plane_median_p ( uint8_t * plane, int w, int h, int level)
1084 {
1085 	int i;
1086 	int min;
1087 	int max;
1088 	int avg;
1089 	int cnt;
1090 	int c;
1091 	int e;
1092 	uint8_t * p;
1093 	uint8_t * d;
1094 
1095 	if(level==0) return;
1096 
1097 	p = plane;
1098 	d = scratchplane1;
1099 
1100 	// remove strong outliers from the image. An outlier is a pixel which lies outside
1101 	// of max-thres and min+thres of the surrounding pixels. This should not cause blurring
1102 	// and it should leave an evenly spread noise-floor to the image.
1103 	for(i=0;i<=(w*h);i++)
1104 	{
1105 	// reset min/max-filter
1106 	min=255;
1107 	max=0;
1108 	// check every remaining position arround the reference-pixel for being min/max...
1109 
1110 	min=(min>*(p-w-1))? *(p-w-1):min;
1111 	max=(max<*(p-w-1))? *(p-w-1):max;
1112 	min=(min>*(p-w+0))? *(p-w+0):min;
1113 	max=(max<*(p-w+0))? *(p-w+0):max;
1114 	min=(min>*(p-w+1))? *(p-w+1):min;
1115 	max=(max<*(p-w+1))? *(p-w+1):max;
1116 
1117 	min=(min>*(p  -1))? *(p  -1):min;
1118 	max=(max<*(p  -1))? *(p  -1):max;
1119 	min=(min>*(p  +1))? *(p  +1):min;
1120 	max=(max<*(p  +1))? *(p  +1):max;
1121 
1122 	min=(min>*(p+w-1))? *(p+w-1):min;
1123 	max=(max<*(p+w-1))? *(p+w-1):max;
1124 	min=(min>*(p+w+0))? *(p+w+0):min;
1125 	max=(max<*(p+w+0))? *(p+w+0):max;
1126 	min=(min>*(p+w+1))? *(p+w+1):min;
1127 	max=(max<*(p+w+1))? *(p+w+1):max;
1128 
1129 	if( *(p)<(min) )
1130 	{
1131 	*(d)=min;
1132 	}
1133 	else
1134 	if( *(p)>(max) )
1135 	{
1136 	*(d)=max;
1137 	}
1138 	else
1139 	{
1140 	*(d)=*(p);
1141 	}
1142 
1143 	d++;
1144 	p++;
1145 	}
1146 
1147 	// in the second stage we try to average similar spatial pixels, only. This, like
1148 	// a median, should also not reduce sharpness but flatten the noisefloor. This
1149 	// part is quite similar to what 2dclean/yuvmedianfilter do. But because of the
1150 	// different weights given to the pixels it is less aggressive...
1151 
1152 	p = scratchplane1;
1153 	d = scratchplane2;
1154 
1155 	// this filter needs values outside of the imageplane, so we just copy the first line
1156 	// and the last line into the out-of-range area...
1157 
1158 	memcpy ( p-w  , p, w );
1159 	memcpy ( p-w*2, p, w );
1160 
1161 	memcpy ( p+(w*h)  , p+(w*h)-w, w );
1162 	memcpy ( p+(w*h)+w, p+(w*h)-w, w );
1163 
1164 	for(i=0;i<=(w*h);i++)
1165 	{
1166 		avg=*(p)*level*2;
1167 		cnt=level;
1168 
1169 		c = *(p-w*2-2);
1170 		e = abs(c-*(p));
1171 		e = ((level-e)<0)? 0:level-e;
1172 		avg += e*c*2;
1173 		cnt += e;
1174 
1175 		c = *(p-w*2-1);
1176 		e = abs(c-*(p));
1177 		e = ((level-e)<0)? 0:level-e;
1178 		avg += e*c*2;
1179 		cnt += e;
1180 
1181 		c = *(p-w*2);
1182 		e = abs(c-*(p));
1183 		e = ((level-e)<0)? 0:level-e;
1184 		avg += e*c*2;
1185 		cnt += e;
1186 
1187 		c = *(p-w*2+1);
1188 		e = abs(c-*(p));
1189 		e = ((level-e)<0)? 0:level-e;
1190 		avg += e*c*2;
1191 		cnt += e;
1192 
1193 		c = *(p-w*2+2);
1194 		e = abs(c-*(p));
1195 		e = ((level-e)<0)? 0:level-e;
1196 		avg += e*c*2;
1197 		cnt += e;
1198 
1199 		c = *(p-w*1-2);
1200 		e = abs(c-*(p));
1201 		e = ((level-e)<0)? 0:level-e;
1202 		avg += e*c*2;
1203 		cnt += e;
1204 
1205 		c = *(p-w*1-1);
1206 		e = abs(c-*(p));
1207 		e = ((level-e)<0)? 0:level-e;
1208 		avg += e*c*2;
1209 		cnt += e;
1210 
1211 		c = *(p-w*1);
1212 		e = abs(c-*(p));
1213 		e = ((level-e)<0)? 0:level-e;
1214 		avg += e*c*2;
1215 		cnt += e;
1216 
1217 		c = *(p-w*1+1);
1218 		e = abs(c-*(p));
1219 		e = ((level-e)<0)? 0:level-e;
1220 		avg += e*c*2;
1221 		cnt += e;
1222 
1223 		c = *(p-w*1+2);
1224 		e = abs(c-*(p));
1225 		e = ((level-e)<0)? 0:level-e;
1226 		avg += e*c*2;
1227 		cnt += e;
1228 
1229 		c = *(p-2);
1230 		e = abs(c-*(p));
1231 		e = ((level-e)<0)? 0:level-e;
1232 		avg += e*c*2;
1233 		cnt += e;
1234 
1235 		c = *(p-1);
1236 		e = abs(c-*(p));
1237 		e = ((level-e)<0)? 0:level-e;
1238 		avg += e*c*2;
1239 		cnt += e;
1240 
1241 		c = *(p+1);
1242 		e = abs(c-*(p));
1243 		e = ((level-e)<0)? 0:level-e;
1244 		avg += e*c*2;
1245 		cnt += e;
1246 
1247 		c = *(p+2);
1248 		e = abs(c-*(p));
1249 		e = ((level-e)<0)? 0:level-e;
1250 		avg += e*c*2;
1251 		cnt += e;
1252 
1253 		c = *(p+w*1-2);
1254 		e = abs(c-*(p));
1255 		e = ((level-e)<0)? 0:level-e;
1256 		avg += e*c*2;
1257 		cnt += e;
1258 
1259 		c = *(p+w*1-1);
1260 		e = abs(c-*(p));
1261 		e = ((level-e)<0)? 0:level-e;
1262 		avg += e*c*2;
1263 		cnt += e;
1264 
1265 		c = *(p+w*1);
1266 		e = abs(c-*(p));
1267 		e = ((level-e)<0)? 0:level-e;
1268 		avg += e*c*2;
1269 		cnt += e;
1270 
1271 		c = *(p+w*1+1);
1272 		e = abs(c-*(p));
1273 		e = ((level-e)<0)? 0:level-e;
1274 		avg += e*c*2;
1275 		cnt += e;
1276 
1277 		c = *(p+w*1+2);
1278 		e = abs(c-*(p));
1279 		e = ((level-e)<0)? 0:level-e;
1280 		avg += e*c*2;
1281 		cnt += e;
1282 
1283 		c = *(p+w*2-2);
1284 		e = abs(c-*(p));
1285 		e = ((level-e)<0)? 0:level-e;
1286 		avg += e*c*2;
1287 		cnt += e;
1288 
1289 		c = *(p+w*2-1);
1290 		e = abs(c-*(p));
1291 		e = ((level-e)<0)? 0:level-e;
1292 		avg += e*c*2;
1293 		cnt += e;
1294 
1295 		c = *(p+w*2);
1296 		e = abs(c-*(p));
1297 		e = ((level-e)<0)? 0:level-e;
1298 		avg += e*c*2;
1299 		cnt += e;
1300 
1301 		c = *(p+w*2+1);
1302 		e = abs(c-*(p));
1303 		e = ((level-e)<0)? 0:level-e;
1304 		avg += e*c*2;
1305 		cnt += e;
1306 
1307 		c = *(p+w*2+2);
1308 		e = abs(c-*(p));
1309 		e = ((level-e)<0)? 0:level-e;
1310 		avg += e*c*2;
1311 		cnt += e;
1312 
1313 		*(d)=(((avg/cnt)+1)/2);
1314 
1315 		d++;
1316 		p++;
1317 	}
1318 
1319 	memcpy(plane,scratchplane2,w*h);
1320 }
1321 
1322 /***********************************************************
1323  * Main Loop                                               *
1324  ***********************************************************/
1325 
init_accel()1326 static void init_accel() {
1327 	filter_plane_median = filter_plane_median_p;
1328 	temporal_filter_planes = temporal_filter_planes_p;
1329 	uint32_t tmp;
1330 
1331 #if defined(__SSE2__)
1332 	int d = 0;
1333 /*	__asm__ volatile("cpuid" : "=d"(d) : "a"(1) : "ebx", "ecx"); */
1334 	__asm__ volatile("movl %%ebx, %1; cpuid; movl %1, %%ebx" : "=d"(d), "=&g"(tmp) : "a"(1) : "ecx");
1335 	if ((d & (1 << 26))) {
1336 		mjpeg_info("SETTING SSE2 for standard Temporal-Noise-Filter");
1337 		temporal_filter_planes = temporal_filter_planes_sse2;
1338 
1339 		/*__asm__ volatile("cpuid" : "=d"(d) : "a"(0x80000001) : "ebx", "ecx");*/
1340 		__asm__ volatile("movl %%ebx, %1; cpuid; movl %1, %%ebx" : "=d"(d), "=&g"(tmp) : "a"(0x80000001) : "ecx");
1341 		if ((d & (1 << 29))) {
1342 			/* x86_64 processor */
1343 			mjpeg_info("SETTING SSE2 for Median-Filter");
1344 			filter_plane_median = filter_plane_median_sse2;
1345 		}
1346 	}
1347 #endif
1348 }
1349 
1350 int
main(int argc,char * argv[])1351 main (int argc, char *argv[])
1352 {
1353   int c;
1354   int fd_in = fileno(stdin);
1355   int fd_out = fileno(stdout);
1356   int err = 0;
1357   char *msg = NULL;
1358   y4m_ratio_t rx, ry;
1359   y4m_frame_info_t iframeinfo;
1360   y4m_stream_info_t istreaminfo;
1361   y4m_frame_info_t oframeinfo;
1362   y4m_stream_info_t ostreaminfo;
1363 
1364   mjpeg_info("yuvdenoise version %s", VERSION);
1365 
1366   while ((c = getopt (argc, argv, "qhvt:g:m:M:r:G:")) != -1)
1367     {
1368       switch (c)
1369 	{
1370 	case 'h':
1371 	  {
1372   	    mjpeg_info("... | yuvdenoise [OPTIONS] | ...");
1373   	    mjpeg_info("Brief description of the accepted options:\n");
1374   	    mjpeg_info("-g [Y=0...255],[U=0...255],[V=0...255]");
1375   	    mjpeg_info("   This sets the parameters [Y,U,V] for the gauss-filter. 0 means no filtering,");
1376   	    mjpeg_info("    which is the default, 255 is maximum gaussfiltering. Some camera manufacturers");
1377   	    mjpeg_info("    think it's a good idea to sharpen the frames extremly. This however will raise");
1378   	    mjpeg_info("    typical video-compression-artifacts (ringing, blocking, alias, etc...). If you");
1379   	    mjpeg_info("    desire to have a video free of these, you can raise the gauss-filter-values");
1380   	    mjpeg_info("    until your image is undesirable soft... (Short: setting decent values helps");
1381   	    mjpeg_info("    both: the viewer and the encoder -- setting too high values is ugly.) This may");
1382   	    mjpeg_info("    also help with extremly noisy images from a weak air-captured signal...");
1383   	    mjpeg_info("    Unlike y4mspatialfilter this will not boost ringing artifacts.\n");
1384   	    mjpeg_info("-m [0...255],[0...255],[0...255]");
1385   	    mjpeg_info("    Spatial-Pre-Filter. This one helps with really noisy signals. You should not");
1386   	    mjpeg_info("    use this with moderate to low noise material. Used with care it helps the tem-");
1387 	     mjpeg_info("   poral-filter a lot. Misuse will lead to rather dull images (like an overly median-filtered image...");
1388   	    mjpeg_info("-t [0...255],[0...255],[0...255]");
1389   	    mjpeg_info("    Temporal-Noise-Filter. This one dramaticaly reduces noise without loosing sharpness. If set too high, however, it may introduce visable ghost-images or smear. ");
1390   	    mjpeg_info("-M [0...255],[0...255],[0...255]");
1391   	    mjpeg_info("    Spatial-Post-Filter. This one removes spatial noise left by the temporal-filter.");
1392   	    mjpeg_info("    Used with care it can dramaticaly lower the bitrate. Using it with to high settings will lead to the same artifacts as the spatial-pre-filter produces.");
1393   	    mjpeg_info("-G [0...255],[0...255],[0...255]");
1394   	    mjpeg_info("    Set -m,-t,-M values at once. -t is boosted by a factor of 2. Useful if you do");
1395   	    mjpeg_info("    not want to tweak all the values by hand... which is better.");
1396   	    mjpeg_info("-r [0...255],[0...255],[0...255]");
1397   	    mjpeg_info("    Add some static masking noise. Might be used as an effect, too... *g*");
1398   	    mjpeg_info("-q  HighQuality-Mode. Warning: On almost any machine this is dead slow...");
1399 	    exit (0);
1400 	    break;
1401 	  }
1402 	case 'v':
1403 	  {
1404 	    verbose = 1;
1405 	    break;
1406 	  }
1407 	case 'q':
1408 	  {
1409 	    hq_mode = 1;
1410 	    break;
1411 	  }
1412 	case 't':
1413 	  {
1414 	    sscanf (optarg, "%i,%i,%i", &temp_Y_thres, &temp_U_thres,
1415 		    &temp_V_thres);
1416 	    break;
1417 	  }
1418 	case 'g':
1419 	  {
1420 	    sscanf (optarg, "%i,%i,%i", &gauss_Y, &gauss_U, &gauss_V);
1421 	    break;
1422 	  }
1423 	case 'm':
1424 	  {
1425 	    sscanf (optarg, "%i,%i,%i", &med_pre_Y_thres, &med_pre_U_thres, &med_pre_V_thres);
1426 	    break;
1427 	  }
1428 	case 'M':
1429 	  {
1430 	    sscanf (optarg, "%i,%i,%i", &med_post_Y_thres, &med_post_U_thres, &med_post_V_thres);
1431 	    break;
1432 	  }
1433 	case 'G':
1434 	  {
1435 	    sscanf (optarg, "%i,%i,%i", &med_pre_Y_thres, &med_pre_U_thres, &med_pre_V_thres);
1436 	    med_post_Y_thres = temp_Y_thres = med_pre_Y_thres;
1437 	    med_post_U_thres = temp_U_thres = med_pre_U_thres;
1438 	    med_post_V_thres = temp_V_thres = med_pre_V_thres;
1439 	    temp_Y_thres *= 2;
1440 	    temp_U_thres *= 2;
1441 	    temp_V_thres *= 2;
1442 	    break;
1443 	  }
1444 	case 'r':
1445 	  {
1446 	    sscanf (optarg, "%i,%i,%i", &renoise_Y, &renoise_U, &renoise_V);
1447 	    break;
1448 	  }
1449 	case '?':
1450 	default:
1451 	  exit (1);
1452 	}
1453     }
1454 
1455   mjpeg_info("Using the following thresholds/settings:");
1456   mjpeg_info("Gauss-Pre-Filter      [Y,U,V] : [%i,%i,%i]",
1457 	     gauss_Y, gauss_U, gauss_V);
1458   mjpeg_info("Median-Pre-Filter     [Y,U,V] : [%i,%i,%i]",
1459 	     med_pre_Y_thres, med_pre_U_thres, med_pre_V_thres);
1460   mjpeg_info("Temporal-Noise-Filter [Y,U,V] : [%i,%i,%i]",
1461 	     temp_Y_thres, temp_U_thres, temp_V_thres);
1462   mjpeg_info("Median-Post-Filter    [Y,U,V] : [%i,%i,%i]",
1463 	     med_post_Y_thres, med_post_U_thres, med_post_V_thres);
1464   mjpeg_info("Renoise               [Y,U,V] : [%i,%i,%i]",
1465 	     renoise_Y, renoise_U, renoise_V);
1466   mjpeg_info("HQ-Mode                       : %s",
1467 	     (hq_mode==0? "off":"on"));
1468 
1469   /* initialize stream-information */
1470   y4m_accept_extensions (1);
1471   y4m_init_stream_info (&istreaminfo);
1472   y4m_init_frame_info (&iframeinfo);
1473   y4m_init_stream_info (&ostreaminfo);
1474   y4m_init_frame_info (&oframeinfo);
1475 
1476   /* open input stream */
1477   if ((err = y4m_read_stream_header (fd_in, &istreaminfo)) != Y4M_OK)
1478       mjpeg_error_exit1("Couldn't read YUV4MPEG header: %s!", y4m_strerr (err));
1479 
1480   /* get format information */
1481   width = y4m_si_get_width (&istreaminfo);
1482   height = y4m_si_get_height (&istreaminfo);
1483   input_chroma_subsampling = y4m_si_get_chroma (&istreaminfo);
1484   input_interlaced = y4m_si_get_interlace (&istreaminfo);
1485   mjpeg_info("Y4M-Stream %ix%i(%s)",
1486 	     width,
1487 	     height, y4m_chroma_description (input_chroma_subsampling));
1488 
1489   lwidth = width;
1490   lheight = height;
1491 
1492   // Setup the denoiser to use the appropriate chroma processing
1493   if (input_chroma_subsampling == Y4M_CHROMA_420JPEG ||
1494       input_chroma_subsampling == Y4M_CHROMA_420MPEG2 ||
1495       input_chroma_subsampling == Y4M_CHROMA_420PALDV)
1496     msg = "Processing Mode : 4:2:0";
1497   else if (input_chroma_subsampling == Y4M_CHROMA_411)
1498     msg = "Processing Mode : 4:1:1";
1499   else if (input_chroma_subsampling == Y4M_CHROMA_422)
1500     msg = "Processing Mode : 4:2:2";
1501   else if (input_chroma_subsampling == Y4M_CHROMA_444)
1502     msg = "Processing Mode : 4:4:4";
1503   else
1504     mjpeg_error_exit1 (" ### Unsupported Y4M Chroma sampling ### ");
1505 
1506   rx = y4m_chroma_ss_x_ratio (input_chroma_subsampling);
1507   ry = y4m_chroma_ss_y_ratio (input_chroma_subsampling);
1508   cwidth = width / rx.d;
1509   cheight = height / ry.d;
1510 
1511   mjpeg_info("%s %s", msg,
1512 	     (input_interlaced ==
1513 	      Y4M_ILACE_NONE) ? "progressive" : "interlaced");
1514   mjpeg_info("Luma-Plane      : %ix%i pixels", lwidth, lheight);
1515   mjpeg_info("Chroma-Plane    : %ix%i pixels", cwidth, cheight);
1516 
1517   if (input_interlaced != Y4M_ILACE_NONE)
1518     {
1519       // process the fields as images side by side
1520       lwidth *= 2;
1521       cwidth *= 2;
1522       lheight /= 2;
1523       cheight /= 2;
1524     }
1525 
1526   y4m_si_set_interlace (&ostreaminfo, y4m_si_get_interlace (&istreaminfo));
1527   y4m_si_set_chroma (&ostreaminfo, y4m_si_get_chroma (&istreaminfo));
1528   y4m_si_set_width (&ostreaminfo, width);
1529   y4m_si_set_height (&ostreaminfo, height);
1530   y4m_si_set_framerate (&ostreaminfo, y4m_si_get_framerate (&istreaminfo));
1531   y4m_si_set_sampleaspect (&ostreaminfo,
1532 			   y4m_si_get_sampleaspect (&istreaminfo));
1533 
1534   /* write the outstream header */
1535   y4m_write_stream_header (fd_out, &ostreaminfo);
1536 
1537   /* now allocate the needed buffers */
1538   {
1539     /* calculate the memory offset needed to allow the processing
1540      * functions to overshot. The biggest overshot is needed for the
1541      * MC-functions, so we'll use 8*width...
1542      */
1543     buff_offset = lwidth * 8;
1544     buff_size = buff_offset * 2 + lwidth * lheight;
1545 
1546     frame1[0] = buff_offset + (uint8_t *) malloc (buff_size);
1547     frame1[1] = buff_offset + (uint8_t *) malloc (buff_size);
1548     frame1[2] = buff_offset + (uint8_t *) malloc (buff_size);
1549 
1550     frame2[0] = buff_offset + (uint8_t *) malloc (buff_size);
1551     frame2[1] = buff_offset + (uint8_t *) malloc (buff_size);
1552     frame2[2] = buff_offset + (uint8_t *) malloc (buff_size);
1553 
1554     frame3[0] = buff_offset + (uint8_t *) malloc (buff_size);
1555     frame3[1] = buff_offset + (uint8_t *) malloc (buff_size);
1556     frame3[2] = buff_offset + (uint8_t *) malloc (buff_size);
1557 
1558     frame4[0] = buff_offset + (uint8_t *) malloc (buff_size);
1559     frame4[1] = buff_offset + (uint8_t *) malloc (buff_size);
1560     frame4[2] = buff_offset + (uint8_t *) malloc (buff_size);
1561 
1562     frame5[0] = buff_offset + (uint8_t *) malloc (buff_size);
1563     frame5[1] = buff_offset + (uint8_t *) malloc (buff_size);
1564     frame5[2] = buff_offset + (uint8_t *) malloc (buff_size);
1565 
1566     frame6[0] = buff_offset + (uint8_t *) malloc (buff_size);
1567     frame6[1] = buff_offset + (uint8_t *) malloc (buff_size);
1568     frame6[2] = buff_offset + (uint8_t *) malloc (buff_size);
1569 
1570     frame7[0] = buff_offset + (uint8_t *) malloc (buff_size);
1571     frame7[1] = buff_offset + (uint8_t *) malloc (buff_size);
1572     frame7[2] = buff_offset + (uint8_t *) malloc (buff_size);
1573 
1574     outframe[0] = buff_offset + (uint8_t *) malloc (buff_size);
1575     outframe[1] = buff_offset + (uint8_t *) malloc (buff_size);
1576     outframe[2] = buff_offset + (uint8_t *) malloc (buff_size);
1577 
1578     scratchplane1 = buff_offset + (uint8_t *) malloc (buff_size);
1579     scratchplane2 = buff_offset + (uint8_t *) malloc (buff_size);
1580 
1581     mjpeg_info("Buffers allocated.");
1582   }
1583 
1584   /* initialize motion_library */
1585   init_motion_search ();
1586 
1587 	init_accel();
1588 
1589   /* read every frame until the end of the input stream and process it */
1590   while (Y4M_OK == (err = y4m_read_frame (fd_in,
1591 					    &istreaminfo,
1592 					    &iframeinfo, frame1)))
1593     {
1594 
1595       static uint32_t frame_nr = 0;
1596       uint8_t *temp[3];
1597 
1598       frame_nr++;
1599 
1600 	gauss_filter_plane (frame1[0], lwidth, lheight, gauss_Y);
1601 	gauss_filter_plane (frame1[1], cwidth, cheight, gauss_U);
1602 	gauss_filter_plane (frame1[2], cwidth, cheight, gauss_V);
1603 
1604 	filter_plane_median (frame1[0], lwidth, lheight, med_pre_Y_thres);
1605 	filter_plane_median (frame1[1], cwidth, cheight, med_pre_U_thres);
1606 	filter_plane_median (frame1[2], cwidth, cheight, med_pre_V_thres);
1607 
1608 	if(hq_mode==1)
1609 	{
1610 		temporal_filter_planes_MC (0, lwidth, lheight, temp_Y_thres);
1611       		temporal_filter_planes_MC (1, cwidth, cheight, temp_U_thres);
1612       		temporal_filter_planes_MC (2, cwidth, cheight, temp_V_thres);
1613 	}
1614 	else
1615 	{
1616 		temporal_filter_planes (0, lwidth, lheight, temp_Y_thres);
1617       		temporal_filter_planes (1, cwidth, cheight, temp_U_thres);
1618       		temporal_filter_planes (2, cwidth, cheight, temp_V_thres);
1619 	}
1620 
1621 	filter_plane_median (outframe[0], lwidth, lheight, med_post_Y_thres);
1622 	filter_plane_median (outframe[1], cwidth, cheight, med_post_U_thres);
1623 	filter_plane_median (outframe[2], cwidth, cheight, med_post_V_thres);
1624 
1625       	renoise (outframe[0], lwidth, lheight, renoise_Y );
1626       	renoise (outframe[1], cwidth, cheight, renoise_U );
1627       	renoise (outframe[2], cwidth, cheight, renoise_V );
1628 
1629       if (frame_nr >= 4)
1630 	y4m_write_frame (fd_out, &ostreaminfo, &oframeinfo, outframe);
1631 
1632       // rotate buffer pointers to rotate input-buffers
1633       temp[0] = frame7[0];
1634       temp[1] = frame7[1];
1635       temp[2] = frame7[2];
1636 
1637       frame7[0] = frame6[0];
1638       frame7[1] = frame6[1];
1639       frame7[2] = frame6[2];
1640 
1641       frame6[0] = frame5[0];
1642       frame6[1] = frame5[1];
1643       frame6[2] = frame5[2];
1644 
1645       frame5[0] = frame4[0];
1646       frame5[1] = frame4[1];
1647       frame5[2] = frame4[2];
1648 
1649       frame4[0] = frame3[0];
1650       frame4[1] = frame3[1];
1651       frame4[2] = frame3[2];
1652 
1653       frame3[0] = frame2[0];
1654       frame3[1] = frame2[1];
1655       frame3[2] = frame2[2];
1656 
1657       frame2[0] = frame1[0];
1658       frame2[1] = frame1[1];
1659       frame2[2] = frame1[2];
1660 
1661       frame1[0] = temp[0];
1662       frame1[1] = temp[1];
1663       frame1[2] = temp[2];
1664 
1665     }
1666 	// write out the left frames...
1667 	y4m_write_frame (fd_out, &ostreaminfo, &oframeinfo, frame4);
1668 	y4m_write_frame (fd_out, &ostreaminfo, &oframeinfo, frame3);
1669 	y4m_write_frame (fd_out, &ostreaminfo, &oframeinfo, frame2);
1670 	y4m_write_frame (fd_out, &ostreaminfo, &oframeinfo, frame1);
1671 
1672   /* free allocated buffers */
1673   {
1674     free (frame1[0] - buff_offset);
1675     free (frame1[1] - buff_offset);
1676     free (frame1[2] - buff_offset);
1677 
1678     free (frame2[0] - buff_offset);
1679     free (frame2[1] - buff_offset);
1680     free (frame2[2] - buff_offset);
1681 
1682     free (frame3[0] - buff_offset);
1683     free (frame3[1] - buff_offset);
1684     free (frame3[2] - buff_offset);
1685 
1686     free (frame4[0] - buff_offset);
1687     free (frame4[1] - buff_offset);
1688     free (frame4[2] - buff_offset);
1689 
1690     free (frame5[0] - buff_offset);
1691     free (frame5[1] - buff_offset);
1692     free (frame5[2] - buff_offset);
1693 
1694     free (frame6[0] - buff_offset);
1695     free (frame6[1] - buff_offset);
1696     free (frame6[2] - buff_offset);
1697 
1698     free (frame7[0] - buff_offset);
1699     free (frame7[1] - buff_offset);
1700     free (frame7[2] - buff_offset);
1701 
1702     free (outframe[0] - buff_offset);
1703     free (outframe[1] - buff_offset);
1704     free (outframe[2] - buff_offset);
1705 
1706 	free (scratchplane1 - buff_offset);
1707 	free (scratchplane2 - buff_offset);
1708 
1709     mjpeg_info("Buffers freed.");
1710   }
1711 
1712   /* did stream end unexpectedly ? */
1713   if (err != Y4M_ERR_EOF)
1714     mjpeg_error_exit1 ("%s", y4m_strerr (err));
1715 
1716   /* Exit gently */
1717   return (0);
1718 }
1719