1 /*
2  *
3  *  This file is part of libmpeg3
4  *
5  * LibMPEG3
6  * Author: Adam Williams <broadcast@earthling.net>
7  * Page: heroine.linuxbox.com
8  * Page: http://www.smalltalkconsulting.com/html/mpeg3source.html (for Squeak)
9  *
10     LibMPEG3 was originally licenced under GPL. It was relicensed by
11     the author under the LGPL and the Squeak license on Nov 1st, 2000
12 
13     This library is free software; you can redistribute it and/or
14     modify it under the terms of the GNU Lesser General Public
15     License as published by the Free Software Foundation; either
16     version 2.1 of the License, or (at your option) any later version.
17 
18     This library is distributed in the hope that it will be useful,
19     but WITHOUT ANY WARRANTY; without even the implied warranty of
20     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21     Lesser General Public License for more details.
22 
23     You should have received a copy of the GNU Lesser General Public
24     License along with this library; if not, write to the Free Software
25     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
26 
27     Also licensed under the Squeak license.
28     http://www.squeak.org/license.html
29  */
30 #include "mpeg3video.h"
31 #include <stdio.h>
32 
33 #ifdef HAVE_MMX
34 
35 #ifdef HAVE_3Dnow
/*
 * recva_mmx: 3DNow! two-row average blended into the destination,
 * 16 bytes per row.
 *
 * For each of h rows, the 16 bytes at s are averaged (pavgusb) with the
 * 16 bytes at s + lx; that result is averaged with the 16 bytes already
 * stored at d and written back to d.  The s, s + lx and d pointers each
 * advance by lx2 bytes after every row.
 *
 *   s    first source row
 *   d    destination row (read and written)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step applied to all three pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): all operands are declared input-only, yet the asm modifies
 * %0 (through "loop"), %1, %2 and %4, and no memory/MMX clobbers are
 * listed -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" is applied to pointer operands, so this presumably
 * assumes a 32-bit x86 target -- confirm.
 */
recva_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)36 static inline void recva_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
37 {
38 	__asm__(
39 		".align	8\n"
40 		"1:"
41 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
42 	   		"movq		8(%1),	%%mm1\n"      /* 8 s */
43 	   		"movq		(%4),	%%mm2\n"      /* 8 s +lx */
44 	   		"movq		8(%4),	%%mm3\n"      /* 8 s +lx **/
45 
			/* mm0/mm1 = average of the two source rows; advance s */
46 			"pavgusb %%mm2, 	%%mm0\n"
47 			"addl		%3,		%1\n"
48 			"pavgusb %%mm3, 	%%mm1\n"
49 
			/* blend with the bytes currently in d; advance s + lx */
50 	   		"movq		(%2),	%%mm2\n"      /* 8 d */
51 	   		"movq		8(%2),	%%mm3\n"      /* 8 d */
52 			"pavgusb %%mm2, 	%%mm0\n"
53 			"addl		%3,		%4\n"
54 			"pavgusb %%mm3, 	%%mm1\n"
55 
			/* store 16 result bytes; advance d */
56 			"movq		%%mm0,	(%2)\n"
57 			"movq		%%mm1,	8(%2)\n"
58 			"addl		%3,		%2\n"
59 		"loop		1b\n"
60       :
61       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
62 	);
63 }
64 
/*
 * recvac_mmx: 3DNow! two-row average blended into the destination,
 * 8 bytes per row.
 *
 * For each of h rows, the 8 bytes at s are averaged (pavgusb) with the
 * 8 bytes at s + lx, the result is averaged with the 8 bytes already at d,
 * and the outcome is stored to d.  s, s + lx and d advance by lx2 per row.
 *
 *   s    first source row
 *   d    destination row (read and written)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step applied to all three pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1, %2 and %4 are modified
 * and no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
recvac_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)65 static inline void recvac_mmx(unsigned char *s, unsigned char *d, int lx,int lx2, int h)
66 {
67 	__asm__(
68 		".align	8\n"
69 		"1:"
70 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
71 	   		"movq		(%4),	%%mm2\n"      /* 8 s +lx */
72 			"addl		%3,		%1\n"
73 			"pavgusb %%mm2, 	%%mm0\n"
74 	   		"movq		(%2),	%%mm3\n"      /* 8 d */
75 			"addl		%3,		%4\n"
76 			"pavgusb %%mm3, 	%%mm0\n"
77 			"movq		%%mm0,	(%2)\n"
78 			"addl		%3,		%2\n"
79 		"loop		1b\n"
80       :
81       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
82 	);
83 }
84 
/*
 * rech_mmx: 3DNow! average of each byte with its right-hand neighbour,
 * 16 bytes per row.
 *
 * For each of h rows, s[0..15] is averaged (pavgusb) with s[1..16] and the
 * 16 results are stored to d; the previous contents of d are not read.
 * Note that 17 source bytes are touched per row.  s and d advance by lx2
 * bytes after every row.
 *
 *   s    source row
 *   d    destination row (write only)
 *   lx2  byte step applied to both pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rech_mmx(unsigned char * s,unsigned char * d,int lx2,int h)85 static inline void rech_mmx(unsigned char *s, unsigned char *d, int lx2, int h)
86 {
87 	__asm__ (
88 		".align	8\n"
89 		"1:"
90 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
91 	   		"movq		8(%1),	%%mm1\n"      /* 8 s */
92 	   		"movq		1(%1),	%%mm2\n"      /* 8 s */
93 	   		"movq		9(%1),	%%mm3\n"      /* 8 s */
94 
95 			"pavgusb 	%%mm2, 	%%mm0\n"
96 			"addl		%3,		%1\n"
97 			"pavgusb 	%%mm3, 	%%mm1\n"
98 
99 			"movq		%%mm0,	(%2)\n"
100 			"movq		%%mm1,	8(%2)\n"
101 			"addl		%3,		%2\n"
102 		"loop		1b\n"
103       :
104       : "c" (h), "r" (s), "r" (d), "r" (lx2)
105 	);
106 }
107 
/*
 * rechc_mmx: 3DNow! average of each byte with its right-hand neighbour,
 * 8 bytes per row.
 *
 * For each of h rows, s[0..7] is averaged (pavgusb) with s[1..8] and the
 * 8 results are stored to d; the previous contents of d are not read.
 * s and d advance by lx2 bytes after every row.
 *
 *   s    source row (9 bytes are read per row)
 *   d    destination row (write only)
 *   lx2  byte step applied to both pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rechc_mmx(unsigned char * s,unsigned char * d,int lx2,int h)108 static inline void rechc_mmx(unsigned char *s, unsigned char *d, int lx2, int h)
109 {
110 	__asm__ (
111 		".align	8\n"
112 		"1:"
113 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
114 	   		"movq		1(%1),	%%mm2\n"      /* 8 s +1 */
115 			"addl		%3,		%1\n"
116 			"pavgusb 	%%mm2, 	%%mm0\n"
117 			"movq		%%mm0,	(%2)\n"
118 			"addl		%3,		%2\n"
119 		"loop		1b\n"
120       :
121       : "c" (h), "r" (s), "r" (d), "r" (lx2)
122 	);
123 }
124 
/*
 * recha_mmx: 3DNow! right-neighbour average blended into the destination,
 * 16 bytes per row.
 *
 * For each of h rows, s[0..15] is averaged (pavgusb) with s[1..16]; that
 * result is averaged with the 16 bytes already at d and stored back to d.
 * s and d advance by lx2 bytes after every row.
 *
 *   s    source row (17 bytes are read per row)
 *   d    destination row (read and written)
 *   lx2  byte step applied to both pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
recha_mmx(unsigned char * s,unsigned char * d,int lx2,int h)125 static inline void recha_mmx(unsigned char *s, unsigned char *d,int lx2, int h)
126 {
127 	__asm__ (
128 		".align	8\n"
129 		"1:"
130 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
131 	   		"movq		8(%1),	%%mm1\n"      /* 8 s */
132 	   		"movq		1(%1),	%%mm2\n"      /* 8 s */
133 	   		"movq		9(%1),	%%mm3\n"      /* 8 s */
134 
135 			"pavgusb 	%%mm2, 	%%mm0\n"
136 			"addl		%3,		%1\n"
137 			"pavgusb 	%%mm3, 	%%mm1\n"
138 
			/* mm2/mm3 are reused here to hold the current destination bytes */
139 	   		"movq		(%2),	%%mm2\n"      /* 8 d */
140 	   		"movq		8(%2),	%%mm3\n"      /* 8 d */
141 			"pavgusb 	%%mm2, 	%%mm0\n"
142 			"pavgusb 	%%mm3, 	%%mm1\n"
143 
144 			"movq		%%mm0,	(%2)\n"
145 			"movq		%%mm1,	8(%2)\n"
146 			"addl		%3,		%2\n"
147 		"loop		1b\n"
148       :
149       : "c" (h), "r" (s), "r" (d), "r" (lx2)
150 	);
151 }
152 
/*
 * rechac_mmx: 3DNow! right-neighbour average blended into the destination,
 * 8 bytes per row.
 *
 * For each of h rows, s[0..7] is averaged (pavgusb) with s[1..8]; that
 * result is averaged with the 8 bytes already at d and stored back to d.
 * s and d advance by lx2 bytes after every row.
 *
 *   s    source row (9 bytes are read per row)
 *   d    destination row (read and written)
 *   lx2  byte step applied to both pointers per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rechac_mmx(unsigned char * s,unsigned char * d,int lx2,int h)153 static inline void rechac_mmx(unsigned char *s,unsigned char  *d, int lx2, int h)
154 {
155 	__asm__ (
156 		".align	8\n"
157 		"1:"
158 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
159 	   		"movq		1(%1),	%%mm2\n"      /* 8 s */
160 
161 			"addl		%3,		%1\n"
162 			"pavgusb 	%%mm2, 	%%mm0\n"
163 
164 	   		"movq		(%2),	%%mm1\n"      /* 8 d */
165 			"pavgusb 	%%mm1, 	%%mm0\n"
166 
167 			"movq		%%mm0,	(%2)\n"
168 			"addl		%3,		%2\n"
169 		"loop		1b\n"
170       :
171       : "c" (h), "r" (s), "r" (d), "r" (lx2)
172 	);
173 }
174 
/*
 * rec4_mmx: 3DNow! blend of a byte with its right, lower and lower-right
 * neighbours, 16 bytes per row.
 *
 * Before the loop, row s (s[0..15]) and its one-byte-shifted copy
 * (s[1..16]) are loaded into mm0/mm1 and mm2/mm3.  Each iteration then
 *   - loads the next row at s + lx into mm4/mm5 and its shifted copy into
 *     mm6/mm7,
 *   - folds the inputs together with three successive pavgusb operations
 *     (current+shifted, then next row, then next row shifted),
 *   - stores 16 bytes to d,
 *   - copies the "next row" registers into the "current row" registers so
 *     every source row is loaded from memory only once.
 * Only the s + lx pointer (%4) and d (%2) advance, each by lx2 per row.
 *
 *   s    first source row (used only for the pre-loop loads)
 *   d    destination row (write only)
 *   lx   byte offset from s to the following source row
 *   lx2  byte step applied to the s + lx pointer and to d per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): a chain of pairwise averages is a different computation
 * from the (a + b + c + d + 2) >> 2 used by the C rec4() in this file, so
 * the two are presumably not bit-exact -- confirm that this is intended.
 * NOTE(review): operands are input-only but %0, %2 and %4 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rec4_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)175 static inline void rec4_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
176 {
177 	__asm__ __volatile__(
178 	   	"movq		(%1),	%%mm0\n"  /* 8 s */
179 	   	"movq		8(%1),	%%mm1\n"  /* 8 s */
180 	   	"movq		1(%1),	%%mm2\n"  /* 8 s +1*/
181 	   	"movq		9(%1),	%%mm3\n"  /* 8 s +1*/
182 		".align 8\n"
183 		"1:"
184 	   		"movq		(%4),	%%mm4\n"  /* 8 s+lx */
185 			"pavgusb 	%%mm2, 	%%mm0\n"
186 	   		"movq		8(%4),	%%mm5\n"  /* 8 s+lx */
187 			"pavgusb 	%%mm3, 	%%mm1\n"
188 
189 	   		"movq		1(%4),	%%mm6\n"  /* 8 s+lx +1*/
190 			"pavgusb 	%%mm4, 	%%mm0\n"
191 	   		"movq		9(%4),	%%mm7\n"  /* 8 s+lx +1*/
192 			"pavgusb 	%%mm5, 	%%mm1\n"
193 
194 			"pavgusb 	%%mm6, 	%%mm0\n"
195 			"addl		%3,		%4\n"
196 			"pavgusb 	%%mm7, 	%%mm1\n"
197 			"movq		%%mm0,	(%2)\n"
			/* the row just loaded becomes the "current" row of the next pass */
198 			"movq		%%mm6,	%%mm2\n"
199 			"movq		%%mm7,	%%mm3\n"
200 			"movq		%%mm1,	8(%2)\n"
201 			"movq		%%mm4,	%%mm0\n"
202 			"movq		%%mm5,	%%mm1\n"
203 			"addl		%3,		%2\n"
204 		"loop		1b\n"
205       	:
206 		: "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
207 	);
208 }
209 
/*
 * rec4c_mmx: 3DNow! blend of a byte with its right, lower and lower-right
 * neighbours, 8 bytes per row.
 *
 * Before the loop, s[0..7] and s[1..8] are loaded into mm0 and mm2.  Each
 * iteration loads the next row at s + lx into mm4 and its one-byte-shifted
 * copy into mm6, folds the four inputs together with three successive
 * pavgusb operations, stores 8 bytes to d, and then moves mm6 -> mm2 and
 * mm4 -> mm0 so the row just loaded is reused as the current row of the
 * next pass.  Only the s + lx pointer (%4) and d (%2) advance, by lx2.
 *
 *   s    first source row (used only for the pre-loop loads)
 *   d    destination row (write only)
 *   lx   byte offset from s to the following source row
 *   lx2  byte step applied to the s + lx pointer and to d per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): chained pairwise averages differ from the
 * (a + b + c + d + 2) >> 2 used by the C rec4c() in this file -- confirm
 * that the mismatch is intended.
 * NOTE(review): operands are input-only but %0, %2 and %4 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rec4c_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)210 static inline void rec4c_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
211 {
212 	__asm__ __volatile__(
213 	   	"movq		(%1),	%%mm0\n"  /* 8 s */
214 	   	"movq		1(%1),	%%mm2\n"  /* 8 s +1*/
215 		".align 8\n"
216 		"1:"
217 	   		"movq		(%4),	%%mm4\n"  /* 8 s+lx */
218 			"pavgusb 	%%mm2, 	%%mm0\n"
219 
220 	   		"movq		1(%4),	%%mm6\n"  /* 8 s+lx +1*/
221 			"pavgusb 	%%mm4, 	%%mm0\n"
222 
223 			"addl		%3,		%4\n"
224 			"pavgusb 	%%mm6, 	%%mm0\n"
225 			"movq		%%mm0,	(%2)\n"
226 			"movq		%%mm6,	%%mm2\n"
227 			"movq		%%mm4,	%%mm0\n"
228 			"addl		%3,		%2\n"
229 		"loop		1b\n"
230       	:
231 		: "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
232 	);
233 }
234 
/*
 * rec4a_mmx: 3DNow! four-neighbour blend that is additionally averaged
 * with the existing destination, 16 bytes per row.
 *
 * Before the loop, s[0..15] and s[1..16] are loaded into mm0/mm1 and
 * mm2/mm3.  Each iteration loads the next row (s + lx) into mm4/mm5 and its
 * one-byte-shifted copy into mm6/mm7, folds current row, shifted current
 * row, next row, shifted next row and the bytes already at d together with
 * successive pavgusb operations, stores 16 bytes to d, then moves the
 * "next row" registers into the "current row" registers for the following
 * pass.  Only the s + lx pointer (%4) and d (%2) advance, by lx2.
 *
 *   s    first source row (used only for the pre-loop loads)
 *   d    destination row (read and written)
 *   lx   byte offset from s to the following source row
 *   lx2  byte step applied to the s + lx pointer and to d per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): for bytes 0..7 (mm0) the destination is folded in last,
 * after the shifted next row; for bytes 8..15 (mm1) the destination is
 * folded in before the shifted next row (mm7).  Because chained averages
 * weight later inputs more heavily, the two halves are weighted
 * differently -- confirm whether this is intentional.
 * NOTE(review): chained pairwise averages also differ from the arithmetic
 * used by the C rec4a() in this file.
 * NOTE(review): operands are input-only but %0, %2 and %4 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rec4a_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)235 static inline void rec4a_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
236 {
237 	__asm__ __volatile__(
238 	   	"movq		(%1),	%%mm0\n"  /* 8 s */
239 	   	"movq		8(%1),	%%mm1\n"  /* 8 s */
240 	   	"movq		1(%1),	%%mm2\n"  /* 8 s +1*/
241 	   	"movq		9(%1),	%%mm3\n"  /* 8 s +1*/
242 		".align 8\n"
243 		"1:"
244 	   		"movq		(%4),	%%mm4\n"  /* 8 s+lx */
245 			"pavgusb 	%%mm2, 	%%mm0\n"
246 	   		"movq		8(%4),	%%mm5\n"  /* 8 s+lx */
247 			"pavgusb 	%%mm3, 	%%mm1\n"
248 
249 	   		"movq		1(%4),	%%mm6\n"  /* 8 s+lx +1*/
250 			"pavgusb 	%%mm4, 	%%mm0\n"
251 	   		"movq		9(%4),	%%mm7\n"  /* 8 s+lx +1*/
252 			"pavgusb 	%%mm5, 	%%mm1\n"
			/* mm2/mm3 are reused to hold the current destination bytes */
253 			"movq		(%2),	%%mm2\n"
254 			"pavgusb 	%%mm6, 	%%mm0\n"
255 			"movq		8(%2),	%%mm3\n"
256 
257 			"pavgusb 	%%mm2, 	%%mm0\n"
258 			"addl		%3,		%4\n"
259 			"pavgusb 	%%mm3, 	%%mm1\n"
260 			"movq		%%mm0,	(%2)\n"
261 
262 			"pavgusb 	%%mm7, 	%%mm1\n"
263 			"movq		%%mm6,	%%mm2\n"
264 			"movq		%%mm7,	%%mm3\n"
265 			"movq		%%mm1,	8(%2)\n"
266 			"movq		%%mm4,	%%mm0\n"
267 			"movq		%%mm5,	%%mm1\n"
268 			"addl		%3,		%2\n"
269 		"loop		1b\n"
270       	:
271 		: "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
272 	);
273 }
274 
/*
 * rec4ac_mmx: 3DNow! four-neighbour blend that is additionally averaged
 * with the existing destination, 8 bytes per row.
 *
 * Before the loop, s[0..7] and s[1..8] are loaded into mm0 and mm2.  Each
 * iteration loads the next row (s + lx) into mm4 and its one-byte-shifted
 * copy into mm6, chains pavgusb over current row, shifted current row,
 * next row, shifted next row and finally the 8 bytes already at d, stores
 * the result to d, and moves mm6 -> mm2 and mm4 -> mm0 for the next pass.
 * Only the s + lx pointer (%4) and d (%2) advance, by lx2.
 *
 *   s    first source row (used only for the pre-loop loads)
 *   d    destination row (read and written)
 *   lx   byte offset from s to the following source row
 *   lx2  byte step applied to the s + lx pointer and to d per row
 *   h    row count; passed in ECX and counted down by "loop"
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): chained pairwise averages differ from the arithmetic used
 * by the C rec4ac() in this file -- confirm that this is intended.
 * NOTE(review): operands are input-only but %0, %2 and %4 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl" on pointer operands presumably assumes 32-bit x86.
 */
rec4ac_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)275 static inline void rec4ac_mmx(unsigned char *s, unsigned char  *d, int lx, int lx2, int h)
276 {
277 	__asm__ __volatile__(
278 	   	"movq		(%1),	%%mm0\n"  /* 8 s */
279 	   	"movq		1(%1),	%%mm2\n"  /* 8 s +1*/
280 		".align 8\n"
281 		"1:"
282 	   		"movq		(%4),	%%mm4\n"  /* 8 s+lx */
283 			"pavgusb 	%%mm2, 	%%mm0\n"
284 
285 	   		"movq		1(%4),	%%mm6\n"  /* 8 s+lx +1*/
286 			"pavgusb 	%%mm4, 	%%mm0\n"
287 			"movq		(%2),	%%mm1\n"  /* 8 d */
288 			"pavgusb 	%%mm6, 	%%mm0\n"
289 			"addl		%3,		%4\n"
290 			"pavgusb 	%%mm1, 	%%mm0\n"
291 			"movq		%%mm6,	%%mm2\n"
292 			"movq		%%mm0,	(%2)\n"
293 			"movq		%%mm4,	%%mm0\n"
294 			"addl		%3,		%2\n"
295 		"loop		1b\n"
296       	:
297 		: "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
298 	);
299 }
300 
301 #else  // HAVE_3DNOW
/*
 * 64-bit constants for the plain-MMX (non-3DNow!) averaging paths below;
 * they are only defined when HAVE_3Dnow is not set.
 *   ADD_1     0x01 in each of the 8 bytes, added to every result byte
 *   MASK_AND  0x7f in each of the 8 bytes, ANDed after "psrlw $1" to clear
 *             the top bit of every byte
 * NOTE(review): the asm refers to these through the symbol names _ADD_1 and
 * _MASK_AND rather than through asm operands, which presumably relies on
 * the target prefixing C symbols with an underscore and on the compiler not
 * discarding these otherwise-unreferenced statics -- confirm.
 */
302 	static long long ADD_1	=	0x0101010101010101LL;
303 	static long long MASK_AND = 0x7f7f7f7f7f7f7f7fLL;
304 #endif
305 
/*
 * rec_mmx: MMX straight copy, 16 bytes per row.
 *
 * For each of h rows, two 8-byte movq loads from s are stored unchanged to
 * d.  s advances with "addl" and d with "leal", both by lx2 bytes.  The
 * row counter in %0 (ECX) is decremented with "decl" and the loop repeats
 * while it is non-zero; the "leal" between "decl" and "jnz" leaves the
 * flags untouched.
 *
 *   s    source row
 *   d    destination row (write only)
 *   lx2  byte step applied to both pointers per row
 *   h    row count
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target -- confirm.
 */
rec_mmx(unsigned char * s,unsigned char * d,int lx2,int h)306 static inline void rec_mmx(unsigned char *s, unsigned char *d, int lx2, int h)
307 {
308 	__asm__ __volatile__(
309 		".align 8\n"
310 		"1:\t"
311 			"movq ( %1 ),     	%%mm0\n"			/* 8 s */
312 		    "movq 8( %1 ),    	%%mm2\n"			/* 16 s */
313 		    "movq %%mm0,  		( %2 )\n"
314 			"addl %3,			%1\n"
315 		    "movq %%mm2,  		8( %2 )\n"
316 			"decl %0\n"
317 		    "leal (%2, %3), %2\n"
318 	 	"jnz    1b"
319 		:
320 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
321 	);
322 }
323 
324 
/*
 * recc_mmx: MMX straight copy, 8 bytes per row.
 *
 * For each of h rows, one 8-byte movq load from s is stored unchanged to
 * d.  s advances with "addl" and d with "leal", both by lx2 bytes.  The
 * row counter in %0 (ECX) is decremented with "decl" and the loop repeats
 * while it is non-zero; the "leal" between "decl" and "jnz" leaves the
 * flags untouched.
 *
 *   s    source row
 *   d    destination row (write only)
 *   lx2  byte step applied to both pointers per row
 *   h    row count
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target -- confirm.
 */
recc_mmx(unsigned char * s,unsigned char * d,int lx2,int h)325 static inline void recc_mmx(unsigned char *s, unsigned char *d, int lx2, int h)
326 {
327 	__asm__ __volatile__(
328 		".align 8\n"
329 	    "1:\t"
330 	    	"movq ( %1 ),     %%mm0\n"
331 	      	"addl %3,   	  %1\n"
332 	      	"movq %%mm0,  	  ( %2 )\n"
333 			"decl %0\n"
334 	      	"leal (%2, %3), %2\n"
335 	   	"jnz    1b"
336 		:
337 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
338 	);
339 }
340 
341 
/*
 * reca_mmx: blend the source into the destination, 16 bytes per row.
 *
 * Two implementations are selected at compile time:
 *
 *  - HAVE_3Dnow: each row's 16 source bytes are averaged with the 16 bytes
 *    already at d using pavgusb and stored back to d.
 *
 *  - otherwise (plain MMX): every byte of source and destination is halved
 *    ("psrlw $1" followed by an AND with MASK_AND to drop the bit shifted
 *    in from the neighbouring byte), the two halves are added with
 *    saturating paddusb, and ADD_1 is added to every byte, i.e. each
 *    output byte is (s >> 1) + (d >> 1) + 1.
 *
 * In both variants s and d advance by lx2 bytes per row, for h rows.
 *
 *   s    source row
 *   d    destination row (read and written)
 *   lx2  byte step applied to both pointers per row
 *   h    row count, held in ECX
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): (s >> 1) + (d >> 1) + 1 is not the same as
 * (s + d + 1) >> 1 (e.g. s = d = 0 gives 1, not 0), so the plain-MMX path
 * is not bit-exact with the C reca() in this file -- confirm acceptable.
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target, and _MASK_AND/_ADD_1 are referenced by raw symbol
 * name -- confirm both.
 */
reca_mmx(unsigned char * s,unsigned char * d,int lx2,int h)342 static inline void reca_mmx(unsigned char *s, unsigned char  *d, int lx2, int h)
343 {
344 #ifdef HAVE_3Dnow
345 	__asm__ (
346 		".align	8\n"
347 		"1:"
348 		   	"movq		(%1),	%%mm0\n"      /* 8 s */
349 			"movq		(%2),	%%mm2\n"      /* 8 d */
350 		   	"movq		8(%1),	%%mm1\n"      /* 8 s */
351 		   	"movq		8(%2),	%%mm3\n"      /* 8 d */
352 			"pavgusb 	%%mm2, 	%%mm0\n"
353 			"addl		%3,		%1\n"
354 			"pavgusb 	%%mm3, 	%%mm1\n"
355 
356 			"movq		%%mm0,	(%2)\n"
357 			"movq		%%mm1,	8(%2)\n"
358 			"addl		%3,		%2\n"
359 		"loop		1b\n"
360 		:
361 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
362 	);
363 #else   /* No 3dnow */
364 	__asm__ (
365   		"movq	     _MASK_AND, 	%%mm5\n"
366   		"movq	     _ADD_1, 	%%mm6\n"
367 		"1:\t"
368   			"movq        (%1),%%mm0\n"        /* Load 16 pixels from each row */
369   			"movq        (%2),%%mm1\n"
370   			"movq       8(%1),%%mm2\n"
371   			"movq       8(%2),%%mm3\n"
372   			"psrlw		$1,%%mm0\n"           /* Shift pixels down */
373   			"psrlw	    $1,%%mm1\n"
374   			"pand	    %%mm5,%%mm0\n"        /* Zero out significant bit */
375   			"psrlw	    $1,%%mm2\n"
376   			"pand	    %%mm5,%%mm1\n"
377   			"psrlw	    $1,%%mm3\n"
378   			"pand	    %%mm5,%%mm2\n"
379   			"paddusb    %%mm1,%%mm0\n"        /* Add pixels */
380   			"pand	    %%mm5,%%mm3\n"
381   			"paddusb    %%mm3,%%mm2\n"
382   			"paddusb    %%mm6,%%mm0\n"        /* Add 1 to results */
383   			"paddusb    %%mm6,%%mm2\n"
384   			"movq       %%mm0,(%2)\n"
385   			"addl       %3,%1\n"
386   			"movq	    %%mm2, 8(%2)\n"
387   			"decl       %0\n"
388   			"leal		(%2, %3), %2\n"
389   		"jnz        1b\n"
390 		:
391 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
392 	);
393 #endif
394 }
395 
396 
/*
 * recac_mmx: blend the source into the destination, 8 bytes per row.
 *
 * Two implementations are selected at compile time:
 *
 *  - HAVE_3Dnow: each row's 8 source bytes are averaged with the 8 bytes
 *    already at d using pavgusb and stored back to d.
 *
 *  - otherwise (plain MMX): source and destination bytes are each halved
 *    ("psrlw $1" then AND with MASK_AND), added with saturating paddusb,
 *    and ADD_1 is added to every byte, i.e. each output byte is
 *    (s >> 1) + (d >> 1) + 1.
 *
 * In both variants s and d advance by lx2 bytes per row, for h rows.
 *
 *   s    source row
 *   d    destination row (read and written)
 *   lx2  byte step applied to both pointers per row
 *   h    row count, held in ECX
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): (s >> 1) + (d >> 1) + 1 differs from (s + d + 1) >> 1
 * when both inputs are even, so the plain-MMX path is not bit-exact with
 * the C recac() in this file -- confirm acceptable.
 * NOTE(review): operands are input-only but %0, %1 and %2 are modified and
 * no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target, and _MASK_AND/_ADD_1 are referenced by raw symbol
 * name -- confirm both.
 */
recac_mmx(unsigned char * s,unsigned char * d,int lx2,int h)397 static inline void recac_mmx(unsigned char *s, unsigned char *d, int lx2, int h)
398 {
399 #ifdef HAVE_3Dnow
400 	__asm__ (
401 		".align	8\n"
402 		"1:"
403 		   	"movq		(%1),	%%mm0\n"      /* 8 s */
404 	   		"movq		(%2),	%%mm2\n"      /* 8 d */
405 			"pavgusb 	%%mm2, 	%%mm0\n"
406 			"addl		%3,		%1\n"
407 			"movq		%%mm0,	(%2)\n"
408 			"addl		%3,		%2\n"
409 		"loop		1b\n"
410 		:
411 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
412 	);
413 #else /* No 3dnow */
414 	__asm__ (
415   		"movq	     _MASK_AND, 	%%mm5\n"
416   		"movq	     _ADD_1, 	%%mm6\n"
417 		"1:\t"
418   			"movq       (%1),%%mm0\n"
419   			"movq       (%2),%%mm1\n"
420   			"psrlw		$1,%%mm0\n"
421   			"psrlw	    $1,%%mm1\n"
422   			"pand	    %%mm5,%%mm0\n"
423   			"pand	    %%mm5,%%mm1\n"
424   			"paddusb    %%mm1,%%mm0\n"
425   			"paddusb    %%mm6,%%mm0\n"
426   			"addl       %3,%1\n"
427   			"movq       %%mm0,(%2)\n"
428   			"decl       %0\n"
429   			"leal		(%2, %3), %2\n"
430   		"jnz        1b\n"
431 		:
432 		: "c" (h), "r" (s), "r" (d), "r" (lx2)
433 		);
434 #endif
435 }
436 
437 
/*
 * recv_mmx: average two source rows lx bytes apart, 16 bytes per row.
 *
 * Two implementations are selected at compile time:
 *
 *  - HAVE_3Dnow: the 16 bytes at s and the 16 bytes at s + lx are averaged
 *    with pavgusb and stored to d.
 *
 *  - otherwise (plain MMX): bytes from both rows are each halved
 *    ("psrlw $1" then AND with MASK_AND), added with saturating paddusb,
 *    and ADD_1 is added to every byte, i.e. each output byte is
 *    (a >> 1) + (b >> 1) + 1.
 *
 * The previous contents of d are not read.  s, s + lx and d all advance by
 * lx2 bytes per row, for h rows.
 *
 *   s    first source row
 *   d    destination row (write only)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step applied to all three pointers per row
 *   h    row count, held in ECX
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): (a >> 1) + (b >> 1) + 1 differs from (a + b + 1) >> 1
 * when both inputs are even, so the plain-MMX path is not bit-exact with
 * the C recv() in this file -- confirm acceptable.
 * NOTE(review): operands are input-only but %0, %1, %2 and %4 are modified
 * and no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target, and _MASK_AND/_ADD_1 are referenced by raw symbol
 * name -- confirm both.
 */
recv_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)438 static inline void recv_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
439 {
440 #ifdef HAVE_3Dnow
441 	__asm__(
442 		".align	8\n"
443 		"1:"
444 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
445 	   		"movq		(%4),	%%mm2\n"      /* 8 s +lx */
446 	   		"movq		8(%1),	%%mm1\n"      /* 8 s */
447 	   		"movq		8(%4),	%%mm3\n"      /* 8 s +lx **/
448 
449 			"pavgusb %%mm2, 	%%mm0\n"
450 			"addl		%3,		%1\n"
451 			"pavgusb %%mm3, 	%%mm1\n"
452 
453 			"movq		%%mm0,	(%2)\n"
454 			"addl		%3,		%4\n"
455 			"movq		%%mm1,	8(%2)\n"
456 			"addl		%3,		%2\n"
457 		"loop		1b\n"
458       :
459       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
460 	);
461 #else
462 	__asm__ (
463   		"movq	     _MASK_AND, 	%%mm5\n"
464   		"movq	     _ADD_1, 	%%mm6\n"
465 		"1:\t"
466 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
467 	   		"movq		(%4),	%%mm1\n"      /* 8 s +lx */
468 	   		"movq		8(%1),	%%mm2\n"      /* 8 s */
469 	   		"movq		8(%4),	%%mm3\n"      /* 8 s +lx **/
470   			"psrlw      $1,%%mm0\n"
471   			"psrlw      $1,%%mm1\n"
472   			"pand       %%mm5,%%mm0\n"
473   			"psrlw      $1,%%mm2\n"
474   			"pand       %%mm5,%%mm1\n"
475   			"psrlw      $1,%%mm3\n"
476   			"pand       %%mm5,%%mm2\n"
477   			"paddusb    %%mm1,%%mm0\n"
478   			"pand       %%mm5,%%mm3\n"
479   			"paddusb    %%mm3,%%mm2\n"
480   			"paddusb    %%mm6,%%mm0\n"
481   			"paddusb    %%mm6,%%mm2\n"
482   			"movq       %%mm0,(%2)\n"
483   			"addl       %3,%1\n"
484   			"movq	    %%mm2, 8(%2)\n"
485   			"addl       %3,%4\n"
486   			"decl       %0\n"
487   			"leal		(%2, %3), %2\n"
488   		"jnz        1b\n"
489       :
490       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
491 	);
492 #endif
493 }
494 
495 
/*
 * recvc_mmx: average two source rows lx bytes apart, 8 bytes per row.
 *
 * Two implementations are selected at compile time:
 *
 *  - HAVE_3Dnow: the 8 bytes at s and the 8 bytes at s + lx are averaged
 *    with pavgusb and stored to d.
 *
 *  - otherwise (plain MMX): bytes from both rows are each halved
 *    ("psrlw $1" then AND with MASK_AND), added with saturating paddusb,
 *    and ADD_1 is added to every byte, i.e. each output byte is
 *    (a >> 1) + (b >> 1) + 1.
 *
 * The previous contents of d are not read.  s, s + lx and d all advance by
 * lx2 bytes per row, for h rows.
 *
 *   s    first source row
 *   d    destination row (write only)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step applied to all three pointers per row
 *   h    row count, held in ECX
 *
 * The loop body runs before the count is tested, so h <= 0 is not handled.
 *
 * NOTE(review): (a >> 1) + (b >> 1) + 1 differs from (a + b + 1) >> 1
 * when both inputs are even, so the plain-MMX path is not bit-exact with
 * the C recvc() in this file -- confirm acceptable.
 * NOTE(review): operands are input-only but %0, %1, %2 and %4 are modified
 * and no clobbers are declared -- check against the GCC extended-asm rules.
 * NOTE(review): "addl"/"leal" on pointer operands presumably assume a
 * 32-bit x86 target, and _MASK_AND/_ADD_1 are referenced by raw symbol
 * name -- confirm both.
 */
recvc_mmx(unsigned char * s,unsigned char * d,int lx,int lx2,int h)496 static inline void recvc_mmx(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
497 {
498 #ifdef HAVE_3Dnow
499 	__asm__(
500 		".align	8\n"
501 		"1:"
502 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
503 	   		"movq		(%4),	%%mm2\n"      /* 8 s +lx */
504 			"addl		%3,		%1\n"
505 			"pavgusb %%mm2, 	%%mm0\n"
506 			"addl		%3,		%4\n"
507 			"movq		%%mm0,	(%2)\n"
508 			"addl		%3,		%2\n"
509 		"loop		1b\n"
510       :
511       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
512 	);
513 #else
514 	__asm__ (
515   		"movq	     _MASK_AND, 	%%mm5\n"
516   		"movq	     _ADD_1, 	%%mm6\n"
517 		"1:\t"
518 	   		"movq		(%1),	%%mm0\n"      /* 8 s */
519 	   		"movq		(%4),	%%mm1\n"      /* 8 s +lx */
520   			"psrlw      $1,%%mm0\n"
521   			"psrlw      $1,%%mm1\n"
522   			"pand       %%mm5,%%mm0\n"
523   			"pand       %%mm5,%%mm1\n"
524   			"paddusb    %%mm1,%%mm0\n"
525   			"addl       %3,%1\n"
526   			"paddusb    %%mm6,%%mm0\n"
527   			"addl       %3,%4\n"
528   			"movq       %%mm0,(%2)\n"
529   			"decl       %0\n"
530   			"leal		(%2, %3), %2\n"
531   		"jnz        1b\n"
532       :
533       : "c" (h), "r" (s), "r" (d), "r" (lx2), "r" (s +lx)
534 	);
535 #endif
536 }
537 
538 #endif  // HAVE_MMX
539 
/*
 * rec: copy a 16-byte-wide block of h rows from s to d.
 *
 *   s    first source row
 *   d    first destination row (previous contents are not read)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is copied when h <= 0
 *
 * Bytes are copied one at a time in ascending order on every platform.
 * There is deliberately no word-sized fast path: accessing these byte
 * buffers through "unsigned long *" makes alignment and aliasing
 * assumptions and copies 4 * sizeof(unsigned long) bytes per row, which is
 * only 16 where long is 4 bytes wide.
 */
static inline void rec(unsigned char *s, unsigned char *d, int lx2, int h)
{
	enum { WIDTH = 16 };
	int row, col;

	for(row = 0; row < h; row++, s += lx2, d += lx2)
	{
		for(col = 0; col < WIDTH; col++)
			d[col] = s[col];
	}
}
565 
566 
567 
/*
 * recc: copy an 8-byte-wide block of h rows from s to d.
 *
 *   s    first source row
 *   d    first destination row (previous contents are not read)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is copied when h <= 0
 *
 * Bytes are copied one at a time in ascending order on every platform.
 * There is deliberately no word-sized fast path: accessing these byte
 * buffers through "long *" makes alignment and aliasing assumptions and
 * copies 2 * sizeof(long) bytes per row, which is only 8 where long is
 * 4 bytes wide.
 */
static inline void recc(unsigned char *s, unsigned char *d, int lx2, int h)
{
	enum { WIDTH = 8 };
	int row, col;

	for(row = 0; row < h; row++, s += lx2, d += lx2)
	{
		for(col = 0; col < WIDTH; col++)
			d[col] = s[col];
	}
}
587 
/*
 * reca: blend a 16-byte-wide source block into the destination.
 *
 * For every byte of every row: d = (d + s + 1) >> 1, i.e. the average of
 * the existing destination byte and the source byte, rounded up.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void reca(unsigned char *s, unsigned char  *d, int lx2, int h)
{
	enum { WIDTH = 16 };
	int row, col;

	for(row = 0; row < h; row++, s += lx2, d += lx2)
	{
		for(col = 0; col < WIDTH; col++)
			d[col] = (unsigned int)(d[col] + s[col] + 1) >> 1;
	}
}
611 
/*
 * recac: blend an 8-byte-wide source block into the destination.
 *
 * For every byte of every row: d = (d + s + 1) >> 1, i.e. the average of
 * the existing destination byte and the source byte, rounded up.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recac(unsigned char *s, unsigned char *d, int lx2, int h)
{
	enum { WIDTH = 8 };
	int row, col;

	for(row = 0; row < h; row++, s += lx2, d += lx2)
	{
		for(col = 0; col < WIDTH; col++)
			d[col] = (unsigned int)(d[col] + s[col] + 1) >> 1;
	}
}
627 
/*
 * recv: average two source rows lx bytes apart, 16 bytes per row.
 *
 * For every byte of every row: d[x] = (s[x] + s[lx + x] + 1) >> 1.
 * The previous contents of d are not read.
 *
 *   s    first source row
 *   d    first destination row
 *   lx   byte offset from s to the second source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recv(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	int row, col;

	for(row = 0; row < h; row++)
	{
		for(col = 0; col < WIDTH; col++)
			out[col] = (unsigned int)(upper[col] + lower[col] + 1) >> 1;

		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
658 
/*
 * recvc: average two source rows lx bytes apart, 8 bytes per row.
 *
 * For every byte of every row: d[x] = (s[x] + s[lx + x] + 1) >> 1.
 * The previous contents of d are not read.
 *
 *   s    first source row
 *   d    first destination row
 *   lx   byte offset from s to the second source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recvc(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 8 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	int row, col;

	for(row = 0; row < h; row++)
	{
		for(col = 0; col < WIDTH; col++)
			out[col] = (unsigned int)(upper[col] + lower[col] + 1) >> 1;

		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
682 
683 
/*
 * recva: average two source rows lx bytes apart and blend the result into
 * the destination, 16 bytes per row.
 *
 * For every byte of every row:
 *   v    = (s[x] + s[lx + x] + 1) >> 1
 *   d[x] = (d[x] + v + 1) >> 1
 * Both steps round up, so the result is rounded twice.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recva(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	unsigned int pred;
	int row, col;

	for(row = 0; row < h; row++)
	{
		for(col = 0; col < WIDTH; col++)
		{
			pred = (unsigned int)(upper[col] + lower[col] + 1) >> 1;
			out[col] = (out[col] + pred + 1) >> 1;
		}
		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
714 
715 
/*
 * recvac: average two source rows lx bytes apart and blend the result into
 * the destination, 8 bytes per row.
 *
 * For every byte of every row:
 *   v    = (s[x] + s[lx + x] + 1) >> 1
 *   d[x] = (d[x] + v + 1) >> 1
 * Both steps round up, so the result is rounded twice.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx   byte offset from s to the second source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recvac(unsigned char *s, unsigned char *d, int lx,int lx2, int h)
{
	enum { WIDTH = 8 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	unsigned int pred;
	int row, col;

	for(row = 0; row < h; row++)
	{
		for(col = 0; col < WIDTH; col++)
		{
			pred = (unsigned int)(upper[col] + lower[col] + 1) >> 1;
			out[col] = (out[col] + pred + 1) >> 1;
		}
		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
737 
738 
/*
 * rech: average each source byte with its right-hand neighbour,
 * 16 bytes per row.
 *
 * For every byte of every row: d[x] = (s[x] + s[x + 1] + 1) >> 1.
 * 17 source bytes are read per row; the previous contents of d are not
 * read.
 *
 *   s    first source row
 *   d    first destination row
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rech(unsigned char *s, unsigned char *d, int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *in = s;
	unsigned char *out = d;
	unsigned int left, right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* Carry the right-hand sample over as the next left-hand one so
		 * that every source byte is read exactly once per row. */
		left = in[0];
		for(col = 0; col < WIDTH; col++)
		{
			right = in[col + 1];
			out[col] = (left + right + 1) >> 1;
			left = right;
		}
		in += lx2;
		out += lx2;
	}
}
768 
769 
/*
 * rechc: average each source byte with its right-hand neighbour,
 * 8 bytes per row.
 *
 * For every byte of every row: d[x] = (s[x] + s[x + 1] + 1) >> 1.
 * 9 source bytes are read per row; the previous contents of d are not
 * read.
 *
 *   s    first source row
 *   d    first destination row
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rechc(unsigned char *s,unsigned char *d, int lx2, int h)
{
	enum { WIDTH = 8 };
	unsigned char *in = s;
	unsigned char *out = d;
	unsigned int left, right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* Carry the right-hand sample over as the next left-hand one so
		 * that every source byte is read exactly once per row. */
		left = in[0];
		for(col = 0; col < WIDTH; col++)
		{
			right = in[col + 1];
			out[col] = (left + right + 1) >> 1;
			left = right;
		}
		in += lx2;
		out += lx2;
	}
}
791 
/*
 * recha: average each source byte with its right-hand neighbour and blend
 * the result into the destination, 16 bytes per row.
 *
 * For every byte of every row:
 *   v    = (s[x] + s[x + 1] + 1) >> 1
 *   d[x] = (d[x] + v + 1) >> 1
 * 17 source bytes are read per row.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void recha(unsigned char *s, unsigned char *d,int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *in = s;
	unsigned char *out = d;
	unsigned int left, right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* Carry the right-hand sample over as the next left-hand one so
		 * that every source byte is read exactly once per row. */
		left = in[0];
		for(col = 0; col < WIDTH; col++)
		{
			right = in[col + 1];
			out[col] = (out[col] + ((left + right + 1) >> 1) + 1) >> 1;
			left = right;
		}
		in += lx2;
		out += lx2;
	}
}
823 
824 
/*
 * rechac: average each source byte with its right-hand neighbour and blend
 * the result into the destination, 8 bytes per row.
 *
 * For every byte of every row:
 *   v    = (s[x] + s[x + 1] + 1) >> 1
 *   d[x] = (d[x] + v + 1) >> 1
 * 9 source bytes are read per row.
 *
 *   s    first source row
 *   d    first destination row (read and written)
 *   lx2  byte step between consecutive rows, applied to both s and d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rechac(unsigned char *s,unsigned char  *d, int lx2, int h)
{
	enum { WIDTH = 8 };
	unsigned char *in = s;
	unsigned char *out = d;
	unsigned int left, right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* Carry the right-hand sample over as the next left-hand one so
		 * that every source byte is read exactly once per row. */
		left = in[0];
		for(col = 0; col < WIDTH; col++)
		{
			right = in[col + 1];
			out[col] = (out[col] + ((left + right + 1) >> 1) + 1) >> 1;
			left = right;
		}
		in += lx2;
		out += lx2;
	}
}
848 
849 
/*
 * rec4: four-sample average of each byte with its right, lower and
 * lower-right neighbours, 16 bytes per row.
 *
 * For every byte of every row:
 *   d[x] = (s[x] + s[x + 1] + s[lx + x] + s[lx + x + 1] + 2) >> 2
 * 17 bytes are read from each of the two source rows; the previous
 * contents of d are not read.
 *
 *   s    upper source row
 *   d    first destination row
 *   lx   byte offset from s to the lower source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rec4(unsigned char *s, unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	unsigned int up_left, up_right, low_left, low_right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* The right-hand column of one output byte is the left-hand
		 * column of the next, so each source byte is read only once. */
		up_left = upper[0];
		low_left = lower[0];
		for(col = 0; col < WIDTH; col++)
		{
			up_right = upper[col + 1];
			low_right = lower[col + 1];
			out[col] = (up_left + up_right + low_left + low_right + 2) >> 2;
			up_left = up_right;
			low_left = low_right;
		}
		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
882 
883 
/*
 * rec4c: four-sample average of each byte with its right, lower and
 * lower-right neighbours, 8 bytes per row.
 *
 * For every byte of every row:
 *   d[x] = (s[x] + s[x + 1] + s[lx + x] + s[lx + x + 1] + 2) >> 2
 * 9 bytes are read from each of the two source rows; the previous
 * contents of d are not read.
 *
 *   s    upper source row
 *   d    first destination row
 *   lx   byte offset from s to the lower source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rec4c(unsigned char *s,unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 8 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	unsigned int up_left, up_right, low_left, low_right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* The right-hand column of one output byte is the left-hand
		 * column of the next, so each source byte is read only once. */
		up_left = upper[0];
		low_left = lower[0];
		for(col = 0; col < WIDTH; col++)
		{
			up_right = upper[col + 1];
			low_right = lower[col + 1];
			out[col] = (up_left + up_right + low_left + low_right + 2) >> 2;
			up_left = up_right;
			low_left = low_right;
		}
		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
908 
909 
/*
 * rec4a: four-sample average of each byte with its right, lower and
 * lower-right neighbours, blended into the destination, 16 bytes per row.
 *
 * For every byte of every row:
 *   v    = (s[x] + s[x + 1] + s[lx + x] + s[lx + x + 1] + 2) >> 2
 *   d[x] = (d[x] + v + 1) >> 1
 * 17 bytes are read from each of the two source rows.
 *
 *   s    upper source row
 *   d    first destination row (read and written)
 *   lx   byte offset from s to the lower source row
 *   lx2  byte step between consecutive output rows, applied to both
 *        source rows and to d
 *   h    number of rows; nothing is done when h <= 0
 */
static inline void rec4a(unsigned char *s,unsigned char *d, int lx, int lx2, int h)
{
	enum { WIDTH = 16 };
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	unsigned int up_left, up_right, low_left, low_right;
	int row, col;

	for(row = 0; row < h; row++)
	{
		/* The right-hand column of one output byte is the left-hand
		 * column of the next, so each source byte is read only once. */
		up_left = upper[0];
		low_left = lower[0];
		for(col = 0; col < WIDTH; col++)
		{
			up_right = upper[col + 1];
			low_right = lower[col + 1];
			out[col] = (out[col]
				+ ((up_left + up_right + low_left + low_right + 2) >> 2)
				+ 1) >> 1;
			up_left = up_right;
			low_left = low_right;
		}
		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
944 
945 
/*
 * Prediction from a 2x2 neighbourhood for an 8 pixel wide block, averaged
 * with what the destination already holds:
 *   p    = (s[x] + s[x + 1] + s[lx + x] + s[lx + x + 1] + 2) >> 2
 *   d[x] = (d[x] + p + 1) >> 1
 *
 *	unsigned char *s;     * top-left source sample, 9 samples are read per row *
 *	unsigned char *d;     * top-left destination sample, read and written *
 *	int lx;               * offset from a source row to the second row of the mean *
 *	int lx2;              * offset between processed rows, source and destination *
 *	int h;                * number of rows *
 */
static inline void rec4ac(unsigned char *s,unsigned char  *d, int lx, int lx2, int h)
{
	unsigned char *upper = s;
	unsigned char *lower = s + lx;
	unsigned char *out = d;
	int row, col;

	for(row = 0; row < h; row++)
	{
/* Sum of the source column to the left of the pixel being produced */
		unsigned int left = upper[0] + lower[0];

		for(col = 0; col < 8; col++)
		{
			unsigned int right = upper[col + 1] + lower[col + 1];

			out[col] = (out[col] + ((left + right + 2) >> 2) + 1) >> 1;
			left = right;
		}

		upper += lx2;
		lower += lx2;
		out += lx2;
	}
}
973 
974 static inline
recon_comp(mpeg3video_t * video,unsigned char * src,unsigned char * dst,int lx,int lx2,int w,int h,int x,int y,int dx,int dy,int addflag)975 void recon_comp(mpeg3video_t *video,
976 		unsigned char *src,
977 		unsigned char *dst,
978 		int lx,
979 		int lx2,
980 		int w,
981 		int h,
982 		int x,
983 		int y,
984 		int dx,
985 		int dy,
986 		int addflag)
987 {
988 	int switcher;
989 	unsigned char *s, *d;
990 
991 /* half pel scaling */
992 	switcher = (dx & 1) << 3 | (dy & 1) << 2 | w;
993 	if(addflag) switcher |= 2;
994 /* origins */
995 	s = src + lx * (y + (dy >> 1)) + x + (dx >> 1);
996 	d = dst + lx * y + x;
997 
998 // Accelerated functions
999 #ifdef HAVE_MMX
1000 	if(video->have_mmx)
1001 	{
1002 		switch(switcher)
1003 		{
1004 			case 0x3: 	reca_mmx(s, d, lx2, h);       break;
1005 			case 0x2:	recac_mmx(s, d, lx2, h);      break;
1006 			case 0x1:	rec_mmx(s, d, lx2, h);        break;
1007 			case 0x0:	recc_mmx(s, d, lx2, h);       break;
1008 			case 0x7:   recva(s, d, lx, lx2, h);  break;
1009 			case 0x6:   recvac(s, d, lx, lx2, h); break;
1010 			case 0x5:	recv_mmx(s, d, lx, lx2, h);   break;
1011 			case 0x4:	recvc_mmx(s, d, lx, lx2, h);  break;
1012 			case 0x9:	rech(s, d, lx2, h);       break;
1013 			case 0x8:   rechc(s, d, lx2, h);      break;
1014 		}
1015 	}
1016 	else
1017 #endif
1018 	{
1019 		switch(switcher)
1020 		{
1021 			case 0x3: 	reca(s, d, lx2, h);       break;
1022 			case 0x2:	recac(s, d, lx2, h);      break;
1023 			case 0x1:	rec(s, d, lx2, h);        break;
1024 			case 0x0:	recc(s, d, lx2, h);       break;
1025 			case 0x7:   recva(s, d, lx, lx2, h);  break;
1026 			case 0x6:   recvac(s, d, lx, lx2, h); break;
1027 			case 0x5:	recv(s, d, lx, lx2, h);   break;
1028 			case 0x4:	recvc(s, d, lx, lx2, h);  break;
1029 			case 0x9:	rech(s, d, lx2, h);       break;
1030 			case 0x8:   rechc(s, d, lx2, h);      break;
1031 		}
1032 	}
1033 
1034 // Unaccelerated functions
1035 	switch(switcher)
1036 	{
1037 		case 0xb: 	recha(s, d, lx2, h);      break;
1038 		case 0xa:	rechac(s, d, lx2, h);     break;
1039 		case 0xf: 	rec4a(s, d, lx, lx2, h);  break;
1040 		case 0xe:	rec4ac(s, d, lx, lx2, h); break;
1041 		case 0xd:	rec4(s, d, lx, lx2, h);   break;
1042 		case 0xc:	rec4c(s, d, lx, lx2, h);  break;
1043 	 }
1044 }
1045 
1046 /*
1047 	unsigned char *src[]; * prediction source buffer *
1048 	int sfield;           * prediction source field number (0 or 1) *
1049 	unsigned char *dst[]; * prediction destination buffer *
1050 	int dfield;           * prediction destination field number (0 or 1)*
1051 	int lx,lx2;           * horizontal offsets *
1052 	int w,h;              * prediction block/sub-block width, height *
1053 	int x,y;              * pixel co-ordinates of top-left sample in current MB *
1054 	int dx,dy;            * horizontal, vertical motion vector *
1055 	int addflag;          * add prediction error to prediction ? *
1056 */
recon(mpeg3video_t * video,unsigned char * src[],int sfield,unsigned char * dst[],int dfield,int lx,int lx2,int w,int h,int x,int y,int dx,int dy,int addflag)1057 static void recon(mpeg3video_t *video,
1058 		unsigned char *src[],
1059 		int sfield,
1060 	    unsigned char *dst[],
1061 		int dfield,
1062 		int lx,
1063 		int lx2,
1064 	    int w,
1065 		int h,
1066 		int x,
1067 		int y,
1068 		int dx,
1069 		int dy,
1070 		int addflag)
1071 {
1072 
1073 /* Y */
1074 	recon_comp(video, (src[0] + (sfield ? (lx2 >> 1) : 0)),
1075 	       dst[0] + (dfield ? (lx2 >> 1) : 0),
1076            lx, lx2, w, h, x, y, dx, dy, addflag);
1077 
1078 	if(video->chroma_format != CHROMA444)
1079 	{
1080       	lx >>= 1;
1081 		dx /= 2;
1082 		lx2 >>= 1;
1083 		w = 0;
1084 		x >>= 1;
1085 	}
1086 
1087 	if(video->chroma_format == CHROMA420)
1088 	{
1089       	h >>= 1;
1090 		dy /= 2;
1091 		y >>= 1;
1092 	}
1093 
1094 /* Cb */
1095 	recon_comp(video, (src[1] + (sfield ? (lx2 >> 1) : 0)),
1096 	       dst[1] + (dfield ? (lx2 >> 1) : 0),
1097 	       lx, lx2, w, h, x, y, dx, dy, addflag);
1098 
1099 /* Cr */
1100 	recon_comp(video, (src[2] + (sfield ? (lx2 >> 1) : 0)),
1101 	       dst[2] + (dfield ? (lx2 >> 1) : 0),
1102            lx, lx2, w, h, x, y, dx, dy, addflag);
1103 }
1104 
/* Passed to recon() as w for every prediction below; recon_comp() ORs it */
/* into bit 0 of its kernel selector. */
#define WIDTH 1

/*
 * Form the motion compensated prediction of one macroblock in
 * video->newframe by calling recon() once per reference block.
 *
 * Forward prediction runs when mb_type has MB_FORWARD set or the picture is
 * a P_TYPE picture. Backward prediction runs when mb_type has MB_BACKWARD
 * set and always reads video->refframe. Once the forward part has run,
 * stwtop and stwbot are forced to 1, so the backward calls pass 1 as addflag.
 *
 *	int bx, by;              * handed to recon() as x, y (by is halved or *
 *	                         * offset by 8 in some modes) *
 *	int mb_type;             * tested for MB_FORWARD and MB_BACKWARD *
 *	int motion_type;         * MC_FRAME, MC_FIELD, MC_16X8 or MC_DMV; a value *
 *	                         * matching no branch is ignored silently *
 *	int PMV[2][2][2];        * motion vectors: [first/second vector] *
 *	                         * [0 forward, 1 backward][0 horizontal, 1 vertical] *
 *	int mv_field_sel[2][2];  * source field: [first/second vector] *
 *	                         * [0 forward, 1 backward] *
 *	int dmvector[2];         * handed to mpeg3video_calc_dmv() for MC_DMV *
 *	int stwtype;             * split into stwtop = stwtype % 3 and *
 *	                         * stwbot = stwtype / 3 *
 *
 * Always returns 0.
 */
int mpeg3video_reconstruct(mpeg3video_t *video,
	int bx,
	int by,
	int mb_type,
	int motion_type,
	int PMV[2][2][2],
	int mv_field_sel[2][2],
	int dmvector[2],
	int stwtype)
{
	int currentfield;
	unsigned char **predframe;
	int DMV[2][2];
	int stwtop, stwbot;

/* Where tested below, a value of 2 skips the prediction; otherwise the */
/* value is passed on to recon() as addflag. */
	stwtop = stwtype % 3; /* 0:temporal, 1 : (spat+temp) / 2, 2 : spatial */
	stwbot = stwtype / 3;

/* Forward prediction */
	if((mb_type & MB_FORWARD) || (video->pict_type == P_TYPE))
	{
    	if(video->pict_struct == FRAME_PICTURE)
		{
    		if((motion_type == MC_FRAME) || !(mb_type & MB_FORWARD))
			{
/* frame-based prediction */
/* Both calls use the same vector; the first handles field 0, the second field 1. */
				{
        			if(stwtop < 2)
        				recon(video, video->oldrefframe, 0, video->newframe, 0,
        	    			video->coded_picture_width, video->coded_picture_width << 1, WIDTH, 8, bx, by,
            			  	PMV[0][0][0], PMV[0][0][1], stwtop);

        			if(stwbot < 2)
        				recon(video, video->oldrefframe, 1, video->newframe, 1,
       	    				video->coded_picture_width, video->coded_picture_width << 1, WIDTH, 8, bx, by,
            				PMV[0][0][0], PMV[0][0][1], stwbot);
    		  	}
    		}
    		else if(motion_type == MC_FIELD) /* field-based prediction */
    		{
/* top field prediction */
        		if(stwtop < 2)
        			recon(video, video->oldrefframe, mv_field_sel[0][0], video->newframe, 0,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by >> 1,
            			PMV[0][0][0], PMV[0][0][1] >> 1, stwtop);

/* bottom field prediction */
        		if(stwbot < 2)
        			recon(video, video->oldrefframe, mv_field_sel[1][0], video->newframe, 1,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by >> 1,
            			PMV[1][0][0], PMV[1][0][1] >> 1, stwbot);
    		}
    		else if(motion_type == MC_DMV)
			{
/* dual prime prediction */
/* calculate derived motion vectors */
        		mpeg3video_calc_dmv(video,
					DMV,
					dmvector,
					PMV[0][0][0],
					PMV[0][0][1] >> 1);

        		if(stwtop < 2)
				{
/* predict top field from top field */
        			recon(video, video->oldrefframe, 0, video->newframe, 0,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by>>1,
            			PMV[0][0][0], PMV[0][0][1] >> 1, 0);

/* predict and add to top field from bottom field */
        			recon(video, video->oldrefframe, 1, video->newframe, 0,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by>>1,
            			DMV[0][0], DMV[0][1], 1);
        		}

        		if(stwbot < 2)
        		{
/* predict bottom field from bottom field */
        			recon(video, video->oldrefframe, 1, video->newframe, 1,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by>>1,
            			PMV[0][0][0], PMV[0][0][1]>>1, 0);

/* predict and add to bottom field from top field */
        			recon(video, video->oldrefframe, 0, video->newframe, 1,
            			video->coded_picture_width << 1, video->coded_picture_width<<1, WIDTH, 8, bx, by>>1,
            			DMV[1][0], DMV[1][1], 1);
        		}
    		}
    	  	else
/* invalid motion_type: nothing is predicted and nothing is reported */
				;
    	}
      	else
      	{
/* TOP_FIELD or BOTTOM_FIELD */
/* field picture */
    		currentfield = (video->pict_struct == BOTTOM_FIELD);

/* determine which frame to use for prediction */
    		if((video->pict_type == P_TYPE) && video->secondfield
        	   && (currentfield != mv_field_sel[0][0]))
        		predframe = video->refframe; /* same frame */
    		else
        	 	predframe = video->oldrefframe; /* previous frame */

    		if((motion_type == MC_FIELD) || !(mb_type & MB_FORWARD))
    		{
/* field-based prediction */
        		if(stwtop < 2)
        			recon(video, predframe,mv_field_sel[0][0],video->newframe,0,
            			video->coded_picture_width << 1,video->coded_picture_width << 1,WIDTH,16,bx,by,
            			PMV[0][0][0],PMV[0][0][1],stwtop);
    		}
    		else
			if(motion_type == MC_16X8)
    		{
/* two predictions of 8 rows each: rows by.. with the first vector, */
/* rows by+8.. with the second */
        		if(stwtop < 2)
        		{
        			recon(video, predframe, mv_field_sel[0][0], video->newframe, 0,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by,
            			PMV[0][0][0], PMV[0][0][1], stwtop);

        			/* determine which frame to use for lower half prediction */
        			if((video->pict_type==P_TYPE) && video->secondfield
            		   && (currentfield!=mv_field_sel[1][0]))
            		  predframe = video->refframe; /* same frame */
        			else
            		  predframe = video->oldrefframe; /* previous frame */

        			recon(video, predframe, mv_field_sel[1][0], video->newframe, 0,
            			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by+8,
            			PMV[1][0][0], PMV[1][0][1], stwtop);
        		}
    		}
    		else
			if(motion_type == MC_DMV) /* dual prime prediction */
    		{
        		if(video->secondfield)
        		  	predframe = video->refframe; /* same frame */
        		else
        		  	predframe = video->oldrefframe; /* previous frame */

/* calculate derived motion vectors */
        		mpeg3video_calc_dmv(video,
					DMV,
					dmvector,
					PMV[0][0][0],
					PMV[0][0][1]);

/* predict from field of same parity */
        		recon(video, video->oldrefframe, currentfield, video->newframe, 0,
        			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 16, bx, by,
        			PMV[0][0][0], PMV[0][0][1], 0);

/* predict from field of opposite parity */
        		recon(video, predframe, !currentfield, video->newframe, 0,
        			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 16, bx, by,
        			DMV[0][0], DMV[0][1], 1);
    		}
    		else
/* invalid motion_type: nothing is predicted and nothing is reported */
			;
		}
/* from here on every prediction is passed 1 as addflag */
      	stwtop = stwbot = 1;
	}

/* Backward prediction */
	if(mb_type & MB_BACKWARD)
	{
    	if(video->pict_struct == FRAME_PICTURE)
    	{
    		if(motion_type == MC_FRAME)
    		{
/* frame-based prediction */
        		if(stwtop < 2)
        			recon(video, video->refframe, 0, video->newframe, 0,
            			video->coded_picture_width, video->coded_picture_width << 1, WIDTH, 8, bx, by,
            			PMV[0][1][0], PMV[0][1][1], stwtop);

        		if(stwbot < 2)
        			recon(video, video->refframe, 1, video->newframe, 1,
						video->coded_picture_width, video->coded_picture_width << 1, WIDTH, 8, bx, by,
						PMV[0][1][0], PMV[0][1][1], stwbot);
    		}
    		else
			{
/* field-based prediction */
/* top field prediction */
				if(stwtop < 2)
				{
					recon(video, video->refframe, mv_field_sel[0][1], video->newframe, 0,
						(video->coded_picture_width << 1), (video->coded_picture_width<<1), WIDTH, 8, bx, (by >> 1),
						PMV[0][1][0], (PMV[0][1][1] >> 1), stwtop);
				}

/* bottom field prediction */
        		if(stwbot < 2)
				{
        			recon(video, video->refframe, mv_field_sel[1][1], video->newframe, 1, (video->coded_picture_width << 1),
						(video->coded_picture_width << 1), WIDTH, 8, bx, (by>>1),
						PMV[1][1][0], (PMV[1][1][1]>>1), stwbot);
				}
    		}
    	}
    	else
		{
/* TOP_FIELD or BOTTOM_FIELD */
/* field picture */
/* stwtop is not tested against 2 in this branch, it is only passed on as addflag */
    		if(motion_type == MC_FIELD)
			{
/* field-based prediction */
        		recon(video, video->refframe, mv_field_sel[0][1], video->newframe, 0,
	    			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 16, bx, by,
	    			PMV[0][1][0], PMV[0][1][1], stwtop);
    		}
    		else if(motion_type==MC_16X8)
    		{
/* rows by.. with the first vector, rows by+8.. with the second */
        		recon(video, video->refframe, mv_field_sel[0][1], video->newframe, 0,
        			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by,
        			PMV[0][1][0], PMV[0][1][1], stwtop);

        		recon(video, video->refframe, mv_field_sel[1][1], video->newframe, 0,
        			video->coded_picture_width << 1, video->coded_picture_width << 1, WIDTH, 8, bx, by+8,
        			PMV[1][1][0], PMV[1][1][1], stwtop);
    		}
    		else
/* invalid motion_type: nothing is predicted and nothing is reported */
			;
    	}
	} /* mb_type & MB_BACKWARD */
	return 0;
}
1340 
1341 
1342