1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard.
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with this library; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  *
20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21  */
22 
23 /**
24  * @file dsputil.c
25  * DSP utils
26  */
27 
28 #include "avcodec.h"
29 #include "dsputil.h"
30 //#include "mpegvideo.h"
31 #include "simple_idct.h"
32 //#include "faandct.h"
33 
/* Clamping lookup table: with cm = cropTbl + MAX_NEG_CROP, cm[x] yields x
   clipped to the 0..255 pixel range (see put/add_pixels_clamped below).
   NOTE(review): filled at init time elsewhere -- not visible in this file. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Squares lookup: code below uses sq = squareTbl + 256 so that sq[v] acts as
   v*v for -256 <= v < 256.  NOTE(review): initialised elsewhere -- verify. */
uint32_t squareTbl[512];
36 
/* Classic 8x8 zigzag scan: maps scan position to raster-order coefficient
   index (the standard JPEG/MPEG coefficient scanning order). */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47 
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   (Maps scan position -> raster coefficient index.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60 
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): filled at init time elsewhere; __align8 keeps it aligned for
   MMX loads. */
uint16_t __align8 inv_zigzag_direct16[64];
63 
/* Alternate horizontal scan pattern: scan position -> raster coefficient
   index. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,  1,   2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74 
/* Alternate vertical scan pattern: scan position -> raster coefficient
   index. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,  8,  16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85 
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] is roughly ceil(2^32 / b).
   Entry 0 is unused (division by zero); entry 1 is 2^32 - 1. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121 
/* Input permutation for the simple_idct_mmx */
/* Maps coefficient index -> permuted index expected by the MMX simple IDCT.
   NOTE(review): consumed by init code outside this chunk. */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133 #if 0
static int pix_sum_c(uint8_t * pix, int line_size)
{
    /* Sum of all 256 pixel values of a 16x16 block.
     * pix:       top-left pixel of the block
     * line_size: bytes between the starts of consecutive rows
     * returns the plain sum over the block. */
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        /* advance to the next row of the image */
        pix += line_size;
    }
    return sum;
}
155 
/**
 * Sum of squares of all pixels in a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size bytes between the starts of consecutive rows
 * @return sum over the block of pix[i]*pix[i]
 *
 * sq is the squares table offset to its centre so sq[v] gives v*v for byte
 * values.  Eight pixels are read per iteration through a single word load.
 * NOTE(review): the word loads cast pix to a 32- or 64-bit word pointer;
 * this assumes suitable alignment and relies on strict-aliasing leniency --
 * verify before porting to a stricter compiler.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
	for (j = 0; j < 16; j += 8) {
#if 0
	    s += sq[pix[0]];
	    s += sq[pix[1]];
	    s += sq[pix[2]];
	    s += sq[pix[3]];
	    s += sq[pix[4]];
	    s += sq[pix[5]];
	    s += sq[pix[6]];
	    s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
	    /* 64-bit target: one 8-byte load, squares of each byte lane */
	    register uint64_t x=*(uint64_t*)pix;
	    s += sq[x&0xff];
	    s += sq[(x>>8)&0xff];
	    s += sq[(x>>16)&0xff];
	    s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
	    /* 32-bit target: two 4-byte loads */
	    register uint32_t x=*(uint32_t*)pix;
	    s += sq[x&0xff];
	    s += sq[(x>>8)&0xff];
	    s += sq[(x>>16)&0xff];
	    s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
	    pix += 8;
	}
	pix += line_size - 16;
    }
    return s;
}
203 
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    /* Byte-swap w 32-bit words from src into dst, element by element
       (in-place operation, dst == src, behaves the same).
       The original unrolled this loop by eight; a single loop performs the
       identical sequence of swaps. */
    int idx;

    for(idx=0; idx<w; idx++)
        dst[idx]= bswap_32(src[idx]);
}
221 
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224     int s, i;
225     uint32_t *sq = squareTbl + 256;
226 
227     s = 0;
228     for (i = 0; i < h; i++) {
229         s += sq[pix1[0] - pix2[0]];
230         s += sq[pix1[1] - pix2[1]];
231         s += sq[pix1[2] - pix2[2]];
232         s += sq[pix1[3] - pix2[3]];
233         s += sq[pix1[4] - pix2[4]];
234         s += sq[pix1[5] - pix2[5]];
235         s += sq[pix1[6] - pix2[6]];
236         s += sq[pix1[7] - pix2[7]];
237         pix1 += line_size;
238         pix2 += line_size;
239     }
240     return s;
241 }
242 
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 {
245     int s, i;
246     uint32_t *sq = squareTbl + 256;
247 
248     s = 0;
249     for (i = 0; i < h; i++) {
250         s += sq[pix1[ 0] - pix2[ 0]];
251         s += sq[pix1[ 1] - pix2[ 1]];
252         s += sq[pix1[ 2] - pix2[ 2]];
253         s += sq[pix1[ 3] - pix2[ 3]];
254         s += sq[pix1[ 4] - pix2[ 4]];
255         s += sq[pix1[ 5] - pix2[ 5]];
256         s += sq[pix1[ 6] - pix2[ 6]];
257         s += sq[pix1[ 7] - pix2[ 7]];
258         s += sq[pix1[ 8] - pix2[ 8]];
259         s += sq[pix1[ 9] - pix2[ 9]];
260         s += sq[pix1[10] - pix2[10]];
261         s += sq[pix1[11] - pix2[11]];
262         s += sq[pix1[12] - pix2[12]];
263         s += sq[pix1[13] - pix2[13]];
264         s += sq[pix1[14] - pix2[14]];
265         s += sq[pix1[15] - pix2[15]];
266 
267         pix1 += line_size;
268         pix2 += line_size;
269     }
270     return s;
271 }
272 
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
274 {
275     int i;
276 
277     /* read the pixels */
278     for(i=0;i<8;i++) {
279         block[0] = pixels[0];
280         block[1] = pixels[1];
281         block[2] = pixels[2];
282         block[3] = pixels[3];
283         block[4] = pixels[4];
284         block[5] = pixels[5];
285         block[6] = pixels[6];
286         block[7] = pixels[7];
287         pixels += line_size;
288         block += 8;
289     }
290 }
291 
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 			  const uint8_t *s2, int stride){
294     int i;
295 
296     /* read the pixels */
297     for(i=0;i<8;i++) {
298         block[0] = s1[0] - s2[0];
299         block[1] = s1[1] - s2[1];
300         block[2] = s1[2] - s2[2];
301         block[3] = s1[3] - s2[3];
302         block[4] = s1[4] - s2[4];
303         block[5] = s1[5] - s2[5];
304         block[6] = s1[6] - s2[6];
305         block[7] = s1[7] - s2[7];
306         s1 += stride;
307         s2 += stride;
308         block += 8;
309     }
310 }
311 
312 
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 				 int line_size)
315 {
316     int i;
317     uint8_t *cm = cropTbl + MAX_NEG_CROP;
318 
319     /* read the pixels */
320     for(i=0;i<8;i++) {
321         pixels[0] = cm[block[0]];
322         pixels[1] = cm[block[1]];
323         pixels[2] = cm[block[2]];
324         pixels[3] = cm[block[3]];
325         pixels[4] = cm[block[4]];
326         pixels[5] = cm[block[5]];
327         pixels[6] = cm[block[6]];
328         pixels[7] = cm[block[7]];
329 
330         pixels += line_size;
331         block += 8;
332     }
333 }
334 
335 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336                           int line_size)
337 {
338     int i;
339     uint8_t *cm = cropTbl + MAX_NEG_CROP;
340 
341     /* read the pixels */
342     for(i=0;i<8;i++) {
343         pixels[0] = cm[pixels[0] + block[0]];
344         pixels[1] = cm[pixels[1] + block[1]];
345         pixels[2] = cm[pixels[2] + block[2]];
346         pixels[3] = cm[pixels[3] + block[3]];
347         pixels[4] = cm[pixels[4] + block[4]];
348         pixels[5] = cm[pixels[5] + block[5]];
349         pixels[6] = cm[pixels[6] + block[6]];
350         pixels[7] = cm[pixels[7] + block[7]];
351         pixels += line_size;
352         block += 8;
353     }
354 }
355 #endif
356 #if 0
357 
/**
 * Generates a family of motion-compensation pixel primitives that process
 * eight pixels (64 bits) per load.  OPNAME is the prefix of the generated
 * functions and OP(dst, val) is the store/combine operation.
 *
 * Generated per invocation:
 *   OPNAME##_pixels_c                 - plain 8-wide row copy/combine
 *   OPNAME##_[no_rnd_]pixels_x2_c     - half-pel interpolation in x
 *   OPNAME##_[no_rnd_]pixels_y2_c     - half-pel interpolation in y
 *   OPNAME##_[no_rnd_]pixels_xy2_c    - half-pel interpolation in x and y
 *   plus 16-wide wrappers built with CALL_2X_PIXELS.
 *
 * The "no_rnd" variants round down ((a&b) + carry form / +0x01.. bias);
 * the rounding variants round up ((a|b) - borrow form / +0x02.. bias).
 *
 * FIX(review): the base routine was declared as OPNAME##_pixels while the
 * CALL_2X_PIXELS wrappers below reference OPNAME##_pixels_c; renamed to
 * OPNAME##_pixels_c so the macro expands into compilable code.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD64(pixels  );\
            b= LD64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
496 
497 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
498 #else // 64 bit variant
499 
500 #define PIXOP2(OPNAME, OP) \
501 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
502     int i;\
503     for(i=0; i<h; i++){\
504         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
505         pixels+=line_size;\
506         block +=line_size;\
507     }\
508 }\
509 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
510     int i;\
511     for(i=0; i<h; i++){\
512         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
513         pixels+=line_size;\
514         block +=line_size;\
515     }\
516 }\
517 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
518     int i;\
519     for(i=0; i<h; i++){\
520         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
521         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
522         pixels+=line_size;\
523         block +=line_size;\
524     }\
525 }\
526 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
527     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
528 }\
529 \
530 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
531                                                 int src_stride1, int src_stride2, int h){\
532     int i;\
533     for(i=0; i<h; i++){\
534         uint32_t a,b;\
535         a= LD32(&src1[i*src_stride1  ]);\
536         b= LD32(&src2[i*src_stride2  ]);\
537         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
538         a= LD32(&src1[i*src_stride1+4]);\
539         b= LD32(&src2[i*src_stride2+4]);\
540         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
541     }\
542 }\
543 \
544 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
545                                                 int src_stride1, int src_stride2, int h){\
546     int i;\
547     for(i=0; i<h; i++){\
548         uint32_t a,b;\
549         a= LD32(&src1[i*src_stride1  ]);\
550         b= LD32(&src2[i*src_stride2  ]);\
551         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
552         a= LD32(&src1[i*src_stride1+4]);\
553         b= LD32(&src2[i*src_stride2+4]);\
554         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
555     }\
556 }\
557 \
558 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
559                                                 int src_stride1, int src_stride2, int h){\
560     int i;\
561     for(i=0; i<h; i++){\
562         uint32_t a,b;\
563         a= LD32(&src1[i*src_stride1  ]);\
564         b= LD32(&src2[i*src_stride2  ]);\
565         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
566     }\
567 }\
568 \
569 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
570                                                 int src_stride1, int src_stride2, int h){\
571     int i;\
572     for(i=0; i<h; i++){\
573         uint32_t a,b;\
574         a= LD16(&src1[i*src_stride1  ]);\
575         b= LD16(&src2[i*src_stride2  ]);\
576         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
577     }\
578 }\
579 \
580 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
581                                                 int src_stride1, int src_stride2, int h){\
582     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
583     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
584 }\
585 \
586 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
587                                                 int src_stride1, int src_stride2, int h){\
588     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
589     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
590 }\
591 \
592 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
593     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
594 }\
595 \
596 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
597     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
598 }\
599 \
600 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
601     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
602 }\
603 \
604 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
605     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
606 }\
607 \
608 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
609                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
610     int i;\
611     for(i=0; i<h; i++){\
612         uint32_t a, b, c, d, l0, l1, h0, h1;\
613         a= LD32(&src1[i*src_stride1]);\
614         b= LD32(&src2[i*src_stride2]);\
615         c= LD32(&src3[i*src_stride3]);\
616         d= LD32(&src4[i*src_stride4]);\
617         l0=  (a&0x03030303UL)\
618            + (b&0x03030303UL)\
619            + 0x02020202UL;\
620         h0= ((a&0xFCFCFCFCUL)>>2)\
621           + ((b&0xFCFCFCFCUL)>>2);\
622         l1=  (c&0x03030303UL)\
623            + (d&0x03030303UL);\
624         h1= ((c&0xFCFCFCFCUL)>>2)\
625           + ((d&0xFCFCFCFCUL)>>2);\
626         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
627         a= LD32(&src1[i*src_stride1+4]);\
628         b= LD32(&src2[i*src_stride2+4]);\
629         c= LD32(&src3[i*src_stride3+4]);\
630         d= LD32(&src4[i*src_stride4+4]);\
631         l0=  (a&0x03030303UL)\
632            + (b&0x03030303UL)\
633            + 0x02020202UL;\
634         h0= ((a&0xFCFCFCFCUL)>>2)\
635           + ((b&0xFCFCFCFCUL)>>2);\
636         l1=  (c&0x03030303UL)\
637            + (d&0x03030303UL);\
638         h1= ((c&0xFCFCFCFCUL)>>2)\
639           + ((d&0xFCFCFCFCUL)>>2);\
640         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
641     }\
642 }\
643 \
644 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
645     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
646 }\
647 \
648 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
649     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
650 }\
651 \
652 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
653     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
654 }\
655 \
656 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
657     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
658 }\
659 \
660 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
661                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
662     int i;\
663     for(i=0; i<h; i++){\
664         uint32_t a, b, c, d, l0, l1, h0, h1;\
665         a= LD32(&src1[i*src_stride1]);\
666         b= LD32(&src2[i*src_stride2]);\
667         c= LD32(&src3[i*src_stride3]);\
668         d= LD32(&src4[i*src_stride4]);\
669         l0=  (a&0x03030303UL)\
670            + (b&0x03030303UL)\
671            + 0x01010101UL;\
672         h0= ((a&0xFCFCFCFCUL)>>2)\
673           + ((b&0xFCFCFCFCUL)>>2);\
674         l1=  (c&0x03030303UL)\
675            + (d&0x03030303UL);\
676         h1= ((c&0xFCFCFCFCUL)>>2)\
677           + ((d&0xFCFCFCFCUL)>>2);\
678         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
679         a= LD32(&src1[i*src_stride1+4]);\
680         b= LD32(&src2[i*src_stride2+4]);\
681         c= LD32(&src3[i*src_stride3+4]);\
682         d= LD32(&src4[i*src_stride4+4]);\
683         l0=  (a&0x03030303UL)\
684            + (b&0x03030303UL)\
685            + 0x01010101UL;\
686         h0= ((a&0xFCFCFCFCUL)>>2)\
687           + ((b&0xFCFCFCFCUL)>>2);\
688         l1=  (c&0x03030303UL)\
689            + (d&0x03030303UL);\
690         h1= ((c&0xFCFCFCFCUL)>>2)\
691           + ((d&0xFCFCFCFCUL)>>2);\
692         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
693     }\
694 }\
695 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
696                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
697     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
698     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
699 }\
700 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
701                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
702     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
703     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
704 }\
705 \
706 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
707 {\
708         int i, a0, b0, a1, b1;\
709         a0= pixels[0];\
710         b0= pixels[1] + 2;\
711         a0 += b0;\
712         b0 += pixels[2];\
713 \
714         pixels+=line_size;\
715         for(i=0; i<h; i+=2){\
716             a1= pixels[0];\
717             b1= pixels[1];\
718             a1 += b1;\
719             b1 += pixels[2];\
720 \
721             block[0]= (a1+a0)>>2; /* FIXME non put */\
722             block[1]= (b1+b0)>>2;\
723 \
724             pixels+=line_size;\
725             block +=line_size;\
726 \
727             a0= pixels[0];\
728             b0= pixels[1] + 2;\
729             a0 += b0;\
730             b0 += pixels[2];\
731 \
732             block[0]= (a1+a0)>>2;\
733             block[1]= (b1+b0)>>2;\
734             pixels+=line_size;\
735             block +=line_size;\
736         }\
737 }\
738 \
739 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
740 {\
741         int i;\
742         const uint32_t a= LD32(pixels  );\
743         const uint32_t b= LD32(pixels+1);\
744         uint32_t l0=  (a&0x03030303UL)\
745                     + (b&0x03030303UL)\
746                     + 0x02020202UL;\
747         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
748                    + ((b&0xFCFCFCFCUL)>>2);\
749         uint32_t l1,h1;\
750 \
751         pixels+=line_size;\
752         for(i=0; i<h; i+=2){\
753             uint32_t a= LD32(pixels  );\
754             uint32_t b= LD32(pixels+1);\
755             l1=  (a&0x03030303UL)\
756                + (b&0x03030303UL);\
757             h1= ((a&0xFCFCFCFCUL)>>2)\
758               + ((b&0xFCFCFCFCUL)>>2);\
759             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
760             pixels+=line_size;\
761             block +=line_size;\
762             a= LD32(pixels  );\
763             b= LD32(pixels+1);\
764             l0=  (a&0x03030303UL)\
765                + (b&0x03030303UL)\
766                + 0x02020202UL;\
767             h0= ((a&0xFCFCFCFCUL)>>2)\
768               + ((b&0xFCFCFCFCUL)>>2);\
769             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
770             pixels+=line_size;\
771             block +=line_size;\
772         }\
773 }\
774 \
/* OPNAME##_pixels8_xy2_c: 2D half-pel interpolation of an 8-wide block.    */\
/* Same word-parallel low-2-bit / high-6-bit trick as pixels4_xy2, applied  */\
/* to the left and right 4-pixel halves in turn (outer j loop).             */\
/* NOTE(review): two rows per iteration - h assumed even.                   */\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top of the block and step 4 pixels right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
815 \
/* OPNAME##_no_rnd_pixels8_xy2_c: identical to pixels8_xy2 except that the  */\
/* per-lane rounding constant is 0x01 instead of 0x02, i.e. the 4-tap       */\
/* average uses (a+b+c+d+1)>>2 - the "no rounding" MPEG variant.            */\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        /* rewind to the top of the block and step 4 pixels right */\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
856 \
/* 16-wide variants built from the 8-wide ones (CALL_2X_PIXELS presumably   */\
/* invokes the 8-pixel routine on the left and right halves, offset by 8 -  */\
/* confirm against its definition).                                         */\
/* NOTE(review): no_rnd_pixels16_c maps to the plain pixels8_c on purpose:  */\
/* a full-pel copy involves no rounding, so the variants are identical.     */\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
865 
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

/* NOTE(review): the PIXOP2 instantiations are commented out, so the        */
/* put_/avg_pixelsN_c symbols referenced later must come from elsewhere -   */
/* confirm against the build.  The #endif above closes a conditional that   */
/* begins before this excerpt.                                              */
//PIXOP2(avg, op_avg)
//PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar rounding averages of 2 and 4 samples */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
877 
878 #if 0
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* One-vector global MC: bilinear interpolation of an 8-pixel-wide,
       h-row block at the 1/16-pel fractional offset (x16, y16).
       The four weights A..D sum to 256, so the weighted sum is
       renormalised with >>8; `rounder` is the rounding constant added
       before the shift.  Reads one extra row and column of `src`. */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row;

    for (row = 0; row < h; row++) {
        int col;
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
901 
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    /* Affine global motion compensation of an 8-pixel-wide, h-row block.
       (ox,oy) is the sub-pel source position of the first output pixel;
       (dxx,dyx) is added per output column, (dxy,dyy) per output row.
       s = 1<<shift is the sub-pel precision and `r` the rounding term
       added before the >>(shift*2) renormalisation.  width/height bound
       the valid source area; out-of-range positions are edge-clamped.
       NOTE(review): positions appear to be 16.16 fixed point (vx>>16),
       with the low `shift` bits of that result used as the fraction -
       confirm against the caller. */
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* last x for which src[x] and src[x+1] are both valid */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* (unsigned) compare rejects negatives and values >= limit */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighbourhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    /* y out of range: clamp the row, interpolate in x only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* x out of range: clamp the column, interpolate in y only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
                                           + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    /* both out of range: nearest clamped sample, no filtering */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
959 
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel case of the thirdpel MC: forward to the plain copy
       routine matching the block width; unknown widths are ignored. */
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
968 
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 1/3 phase: out ~= (2*a + b)/3 with
       a=src[x], b=src[x+1]; 683/2048 approximates 1/3, +1 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + 1] + 1)) >> 11;
    }
}
979 
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, horizontal 2/3 phase: out ~= (a + 2*b)/3 with
       a=src[x], b=src[x+1]; 683/2048 approximates 1/3, +1 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + 1] + 1)) >> 11;
    }
}
990 
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 1/3 phase: out ~= (2*a + c)/3 with
       a=src[x], c=src[x+stride]; 683/2048 approximates 1/3, +1 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + stride] + 1)) >> 11;
    }
}
1001 
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC at (1/3, 1/3): bilinear blend of the 2x2 neighbourhood
       with weights 4,3,3,2 (sum 12); 2731/32768 ~= 1/12, +6 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (4*s[col] + 3*s[col+1] + 3*s[col+stride] + 2*s[col+stride+1] + 6)) >> 15;
    }
}
1012 
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC at (1/3, 2/3): 2x2 blend with weights 3,2,4,3 (sum 12);
       2731/32768 ~= 1/12, +6 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3*s[col] + 2*s[col+1] + 4*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
    }
}
1023 
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC, vertical 2/3 phase: out ~= (a + 2*c)/3 with
       a=src[x], c=src[x+stride]; 683/2048 approximates 1/3, +1 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + stride] + 1)) >> 11;
    }
}
1034 
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC at (2/3, 1/3): 2x2 blend with weights 3,4,2,3 (sum 12);
       2731/32768 ~= 1/12, +6 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3*s[col] + 4*s[col+1] + 2*s[col+stride] + 3*s[col+stride+1] + 6)) >> 15;
    }
}
1045 
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Thirdpel MC at (2/3, 2/3): 2x2 blend with weights 2,3,3,4 (sum 12);
       2731/32768 ~= 1/12, +6 rounds. */
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (2*s[col] + 3*s[col+1] + 3*s[col+stride] + 4*s[col+stride+1] + 6)) >> 15;
    }
}
1056 
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel averaging case of the thirdpel MC: forward to the plain
       averaging routine matching the block width; unknown widths are
       ignored. */
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1065 
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC, horizontal 1/3 phase: interpolate as in the
       put_ variant, then round-average with the existing dst pixel. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1076 
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC, horizontal 2/3 phase. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1087 
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC, vertical 1/3 phase. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2*src[x] + src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1098 
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC at (1/3, 1/3): weights 4,3,3,2 over the 2x2
       neighbourhood, then round-average with the existing dst pixel. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4*src[x] + 3*src[x+1] + 3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1109 
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC at (1/3, 2/3): weights 3,2,4,3. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3*src[x] + 2*src[x+1] + 4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1120 
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC, vertical 2/3 phase. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2*src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1131 
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC at (2/3, 1/3): weights 3,4,2,3. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3*src[x] + 4*src[x+1] + 2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
1142 
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Averaging thirdpel MC at (2/3, 2/3): weights 2,3,3,4. */
    while (height-- > 0) {
        int x;
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2*src[x] + 3*src[x+1] + 3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
#if 0
/* Width-specialised wrappers for the thirdpel MC routines above.
   Currently disabled (#if 0) and kept for reference.  Fix: the original
   text had a stray `void ` before each forwarded call, which would not
   compile if this block were ever enabled; the calls below are plain
   statements. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1174 
/* H264_CHROMA_MC(OPNAME, OP): generates 2-, 4- and 8-pixel-wide H.264      */
/* chroma motion compensation routines.  (x,y) is the 1/8-pel fractional    */
/* position (0..7 in each axis, asserted below); the four bilinear weights  */
/* A+B+C+D always sum to 64, and OP performs the final normalisation and    */
/* the write (put) or rounded average with dst (avg).                       */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1237 
/* b is the 64-weighted bilinear sum from H264_CHROMA_MC: op_put            */
/* normalises with (b+32)>>6 (round to nearest); op_avg additionally        */
/* averages with the existing destination pixel, rounding up.               */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1245 
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 4-pixel-wide block of h rows, one 32-bit word per row. */
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1256 
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy an 8-pixel-wide block of h rows, two 32-bit words per row. */
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1268 
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 16-pixel-wide block of h rows, four 32-bit words per row. */
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1282 
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 17-pixel-wide block (16+1 edge column for qpel filtering):
       four 32-bit words plus one tail byte per row. */
    while (h-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1297 
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy a 9-pixel-wide block (8+1 edge column for qpel filtering):
       two 32-bit words plus one tail byte per row. */
    while (h-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1310 
1311 
/* QPEL_MC(r, OPNAME, RND, OP): generates the MPEG-4 quarter-pel motion     */
/* compensation family.  The half-pel lowpass uses the 8-tap filter         */
/* (-1, 3, -6, 20, 20, -6, 3, -1); near the block edges the outermost       */
/* source samples are reused instead of reading past the supplied window.   */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* NOTE(review): presumably consumed by OP() for clipping - confirm */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
1330 \
/* Vertical counterpart of the 8-wide half-pel lowpass: filters each of the */\
/* 8 columns over 9 source rows (reads src[0..8*srcStride]), mirroring the  */\
/* outermost rows near the edges.                                           */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
1358 \
/* 16-wide horizontal half-pel lowpass: same 8-tap filter as the 8-wide     */\
/* version, reading src[0..16] per row with mirrored edge samples.          */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
1385 \
/* 16-wide vertical half-pel lowpass: filters each of the 16 columns over   */\
/* 17 source rows (reads src[0..16*srcStride]) with mirrored edge rows.     */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
1429 \
/* Quarter-pel dispatch wrappers.  Naming: mcXY = quarter-pel phase X in x  */\
/* and Y in y (0=full, 1=1/4, 2=1/2, 3=3/4).  mc10/mc30 average the         */\
/* half-pel filtered block with the nearer full-pel column (src / src+1).   */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
1449 \
/* Vertical quarter-pel: copy a 16-wide, 9-row window first (copy_block9)   */\
/* so the vertical lowpass can read one row below the 8x8 block; mc01/mc03  */\
/* then average with the upper/lower full-pel row respectively.             */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
/* Diagonal (1/4,1/4).  The exported ff_*_old_c variant 4-way averages the  */\
/* full-pel, half-H, half-V and half-HV planes; the current variant folds   */\
/* the full-pel contribution into halfH first (pixels8_l2), then blends     */\
/* with the HV plane.  halfH holds 9 filtered rows (8x9 = 72 bytes).        */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
/* Diagonal (3/4,1/4): as mc11 but the full-pel reference column is one     */\
/* pixel to the right (full+1).                                             */\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
1513 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1514     uint8_t full[16*9];\
1515     uint8_t halfH[72];\
1516     uint8_t halfV[64];\
1517     uint8_t halfHV[64];\
1518     copy_block9(full, src, 16, stride, 9);\
1519     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1520     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1521     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1522     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1523 }\
1524 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1525     uint8_t full[16*9];\
1526     uint8_t halfH[72];\
1527     uint8_t halfHV[64];\
1528     copy_block9(full, src, 16, stride, 9);\
1529     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1530     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1531     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1532     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1533 }\
1534 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1535     uint8_t full[16*9];\
1536     uint8_t halfH[72];\
1537     uint8_t halfV[64];\
1538     uint8_t halfHV[64];\
1539     copy_block9(full, src, 16, stride, 9);\
1540     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1541     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1542     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1543     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1544 }\
1545 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1546     uint8_t full[16*9];\
1547     uint8_t halfH[72];\
1548     uint8_t halfHV[64];\
1549     copy_block9(full, src, 16, stride, 9);\
1550     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1551     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1552     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1553     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1554 }\
1555 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1556     uint8_t halfH[72];\
1557     uint8_t halfHV[64];\
1558     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1559     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1560     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1561 }\
1562 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1563     uint8_t halfH[72];\
1564     uint8_t halfHV[64];\
1565     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1566     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1567     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1568 }\
1569 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1570     uint8_t full[16*9];\
1571     uint8_t halfH[72];\
1572     uint8_t halfV[64];\
1573     uint8_t halfHV[64];\
1574     copy_block9(full, src, 16, stride, 9);\
1575     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1576     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1577     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1578     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1579 }\
1580 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1581     uint8_t full[16*9];\
1582     uint8_t halfH[72];\
1583     copy_block9(full, src, 16, stride, 9);\
1584     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1585     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1586     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1587 }\
1588 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1589     uint8_t full[16*9];\
1590     uint8_t halfH[72];\
1591     uint8_t halfV[64];\
1592     uint8_t halfHV[64];\
1593     copy_block9(full, src, 16, stride, 9);\
1594     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1595     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1596     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1597     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1598 }\
1599 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1600     uint8_t full[16*9];\
1601     uint8_t halfH[72];\
1602     copy_block9(full, src, 16, stride, 9);\
1603     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1604     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1605     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1606 }\
1607 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1608     uint8_t halfH[72];\
1609     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1610     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1611 }\
1612 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1613     OPNAME ## pixels16_c(dst, src, stride, 16);\
1614 }\
1615 \
1616 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1617     uint8_t half[256];\
1618     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1619     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1620 }\
1621 \
1622 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1623     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1624 }\
1625 \
1626 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1627     uint8_t half[256];\
1628     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1629     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1630 }\
1631 \
1632 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1633     uint8_t full[24*17];\
1634     uint8_t half[256];\
1635     copy_block17(full, src, 24, stride, 17);\
1636     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1637     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1638 }\
1639 \
1640 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1641     uint8_t full[24*17];\
1642     copy_block17(full, src, 24, stride, 17);\
1643     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1644 }\
1645 \
1646 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1647     uint8_t full[24*17];\
1648     uint8_t half[256];\
1649     copy_block17(full, src, 24, stride, 17);\
1650     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1651     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1652 }\
1653 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1654     uint8_t full[24*17];\
1655     uint8_t halfH[272];\
1656     uint8_t halfV[256];\
1657     uint8_t halfHV[256];\
1658     copy_block17(full, src, 24, stride, 17);\
1659     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1660     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1661     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1662     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1663 }\
1664 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1665     uint8_t full[24*17];\
1666     uint8_t halfH[272];\
1667     uint8_t halfHV[256];\
1668     copy_block17(full, src, 24, stride, 17);\
1669     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1670     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1671     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1672     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1673 }\
1674 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1675     uint8_t full[24*17];\
1676     uint8_t halfH[272];\
1677     uint8_t halfV[256];\
1678     uint8_t halfHV[256];\
1679     copy_block17(full, src, 24, stride, 17);\
1680     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1681     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1682     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1683     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1684 }\
1685 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1686     uint8_t full[24*17];\
1687     uint8_t halfH[272];\
1688     uint8_t halfHV[256];\
1689     copy_block17(full, src, 24, stride, 17);\
1690     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1691     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1692     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1693     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1694 }\
1695 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1696     uint8_t full[24*17];\
1697     uint8_t halfH[272];\
1698     uint8_t halfV[256];\
1699     uint8_t halfHV[256];\
1700     copy_block17(full, src, 24, stride, 17);\
1701     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1702     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1703     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1704     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1705 }\
1706 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1707     uint8_t full[24*17];\
1708     uint8_t halfH[272];\
1709     uint8_t halfHV[256];\
1710     copy_block17(full, src, 24, stride, 17);\
1711     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1712     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1713     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1714     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1715 }\
1716 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1717     uint8_t full[24*17];\
1718     uint8_t halfH[272];\
1719     uint8_t halfV[256];\
1720     uint8_t halfHV[256];\
1721     copy_block17(full, src, 24, stride, 17);\
1722     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1723     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1724     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1725     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1726 }\
1727 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1728     uint8_t full[24*17];\
1729     uint8_t halfH[272];\
1730     uint8_t halfHV[256];\
1731     copy_block17(full, src, 24, stride, 17);\
1732     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1733     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1734     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1735     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1736 }\
1737 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1738     uint8_t halfH[272];\
1739     uint8_t halfHV[256];\
1740     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1741     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1742     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1743 }\
1744 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1745     uint8_t halfH[272];\
1746     uint8_t halfHV[256];\
1747     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1748     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1749     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1750 }\
1751 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1752     uint8_t full[24*17];\
1753     uint8_t halfH[272];\
1754     uint8_t halfV[256];\
1755     uint8_t halfHV[256];\
1756     copy_block17(full, src, 24, stride, 17);\
1757     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1758     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1759     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1760     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1761 }\
1762 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1763     uint8_t full[24*17];\
1764     uint8_t halfH[272];\
1765     copy_block17(full, src, 24, stride, 17);\
1766     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1767     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1768     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1769 }\
1770 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1771     uint8_t full[24*17];\
1772     uint8_t halfH[272];\
1773     uint8_t halfV[256];\
1774     uint8_t halfHV[256];\
1775     copy_block17(full, src, 24, stride, 17);\
1776     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1777     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1778     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1779     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1780 }\
1781 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1782     uint8_t full[24*17];\
1783     uint8_t halfH[272];\
1784     copy_block17(full, src, 24, stride, 17);\
1785     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1786     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1787     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1788 }\
1789 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1790     uint8_t halfH[272];\
1791     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1792     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1793 }
1794 
/* Write-back operators plugged into QPEL_MC.  The filter output "b" is an
   unnormalized 6-tap sum (scale factor 32); (b+16)>>5 rounds to nearest and
   cm[] clamps the result to [0,255].  The *_no_rnd variants add 15 instead
   of 16 so that exact halves round down ("no rounding" mode). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel MC function families: put, no-rounding put, avg. */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1808 
#if 1
/*
 * H264_LOWPASS: emits the H.264 6-tap (1,-5,20,20,-5,1) half-sample
 * interpolation filters for 4x4, 8x8 and 16x16 blocks:
 *   *_h_lowpass  - horizontal filter (reads src[-2..w+2] per row)
 *   *_v_lowpass  - vertical filter (reads 2 rows above / 3 below)
 *   *_hv_lowpass - horizontal pass into a 16 bit tmp buffer, then a
 *                  vertical pass over tmp (no intermediate rounding)
 * OP(dst, val)  stores a once-filtered value (caller's op divides by 32),
 * OP2(dst, val) stores a twice-filtered value (caller's op divides by 1024).
 * The 16-wide/16-tall versions are assembled from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP; /* clamp-to-[0,255] lookup table */\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* 6-tap vertical pass needs 2 rows of context above */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2); /* rewind to row 2, the first output row */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride; /* 6-tap vertical pass needs 2 rows of context above */\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2); /* rewind to row 2, the first output row */\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* four 8x8 quadrants */\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){ /* four 8x8 quadrants */\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){ /* four 8x8 quadrants */\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

/*
 * H264_MC: emits the 16 quarter-pel motion compensation functions
 * _mcXY_c for one block SIZE, where X and Y are the quarter-sample
 * offsets (0..3) in the horizontal and vertical direction.  Fractional
 * positions are built from the h/v/hv lowpass half-sample filters and,
 * for quarter positions, a pixel average (pixels##SIZE##_l2) of two
 * neighbouring half/full-sample planes.  "full" buffers hold SIZE+5
 * rows (2 above, 3 below) so the 6-tap vertical filter has context.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){ /* (0,0): integer position, plain copy */\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/4,0): average of src and h half-pel */\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,0): horizontal half-pel */\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){ /* (3/4,0): average of src+1 and h half-pel */\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,1/4): average of src and v half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,1/2): vertical half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){ /* (0,3/4): average of next row and v half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){ /* diagonal: average of h and v half-pel */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){ /* diagonal: v filter taken one column right */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* diagonal: h filter taken one row down */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* diagonal: offset both one row and one column */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* (1/2,1/2): full 2D filter */\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* average of h half-pel and hv center */\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc21 but h filter one row down */\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* average of v half-pel and hv center */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* as mc12 but v filter one column right */\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

/* Store/average ops plugged into the H.264 lowpass/MC macro expansions.
   The filter output is rounded and shifted back to pixel range:
   >>5 after a single filter pass, >>10 after the combined H+V pass. */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Instantiate the lowpass filters and all quarter-pel MC functions,
   for both "put" and "avg" variants at block sizes 4, 8 and 16. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2170 
2171 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2172     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2173     int i;
2174 
2175     for(i=0; i<h; i++){
2176         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2177         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2178         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2179         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2180         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2181         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2182         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2183         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2184         dst+=dstStride;
2185         src+=srcStride;
2186     }
2187 }
2188 
2189 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2190     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2191     int i;
2192 
2193     for(i=0; i<w; i++){
2194         const int src_1= src[ -srcStride];
2195         const int src0 = src[0          ];
2196         const int src1 = src[  srcStride];
2197         const int src2 = src[2*srcStride];
2198         const int src3 = src[3*srcStride];
2199         const int src4 = src[4*srcStride];
2200         const int src5 = src[5*srcStride];
2201         const int src6 = src[6*srcStride];
2202         const int src7 = src[7*srcStride];
2203         const int src8 = src[8*srcStride];
2204         const int src9 = src[9*srcStride];
2205         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2206         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2207         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2208         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2209         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2210         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2211         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2212         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2213         src++;
2214         dst++;
2215     }
2216 }
2217 
/* WMV2 mspel MC, integer-pel position: plain 8x8 block copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2221 
/* Horizontal quarter-pel (left): average of src and the h-filtered half. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2227 
/* Horizontal half-pel: filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2231 
/* Horizontal quarter-pel (right): average of src+1 and the h-filtered half. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2237 
/* Vertical half-pel: filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2241 
/* Mixed h-quarter/v-half pel: average of the v-filtered source and the
 * h-then-v filtered source. halfH holds 11 rows (8 plus the extra
 * context rows the vertical filter needs); halfH+8 skips the top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Same as mc12 but with the vertical-only path sampled at src+1
 * (right-side horizontal quarter-pel). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* Center (h-half, v-half) position: horizontal filter into a temporary,
 * then vertical filter straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2265 #endif
2266 #if 0
/* H.263 in-loop deblocking across a horizontal block edge: for each of
 * the 8 columns, filters the 4 pixels straddling the edge (2 above,
 * 2 below). Filter strength is looked up from the quantizer. */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction: proportional for small |d|,
           ramping back down to zero once |d| reaches 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255: bit 8 set means the value left that range;
           ~(p>>31) yields 255 on overflow, 0 on underflow */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= ABS(d1)>>1;

        /* weaker correction for the outer pixel pair, limited to |d1|/2 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2301 
/* H.263 in-loop deblocking across a vertical block edge: for each of
 * the 8 rows, filters the 4 pixels straddling the edge (2 left,
 * 2 right). Same algorithm as h263_v_loop_filter_c, transposed. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear correction, ramping to zero past 2*strength */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* clamp to 0..255 (see h263_v_loop_filter_c) */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= ABS(d1)>>1;

        /* weaker correction for the outer pixels, limited to |d1|/2 */
        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2336 #endif
2337 #if 0
2338 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2339 {
2340     int s, i;
2341 
2342     s = 0;
2343     for(i=0;i<h;i++) {
2344         s += abs(pix1[0] - pix2[0]);
2345         s += abs(pix1[1] - pix2[1]);
2346         s += abs(pix1[2] - pix2[2]);
2347         s += abs(pix1[3] - pix2[3]);
2348         s += abs(pix1[4] - pix2[4]);
2349         s += abs(pix1[5] - pix2[5]);
2350         s += abs(pix1[6] - pix2[6]);
2351         s += abs(pix1[7] - pix2[7]);
2352         s += abs(pix1[8] - pix2[8]);
2353         s += abs(pix1[9] - pix2[9]);
2354         s += abs(pix1[10] - pix2[10]);
2355         s += abs(pix1[11] - pix2[11]);
2356         s += abs(pix1[12] - pix2[12]);
2357         s += abs(pix1[13] - pix2[13]);
2358         s += abs(pix1[14] - pix2[14]);
2359         s += abs(pix1[15] - pix2[15]);
2360         pix1 += line_size;
2361         pix2 += line_size;
2362     }
2363     return s;
2364 }
2365 
2366 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2367 {
2368     int s, i;
2369 
2370     s = 0;
2371     for(i=0;i<h;i++) {
2372         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2373         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2374         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2375         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2376         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2377         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2378         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2379         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2380         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2381         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2382         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2383         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2384         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2385         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2386         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2387         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2388         pix1 += line_size;
2389         pix2 += line_size;
2390     }
2391     return s;
2392 }
2393 
2394 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2395 {
2396     int s, i;
2397     uint8_t *pix3 = pix2 + line_size;
2398 
2399     s = 0;
2400     for(i=0;i<h;i++) {
2401         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2402         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2403         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2404         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2405         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2406         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2407         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2408         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2409         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2410         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2411         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2412         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2413         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2414         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2415         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2416         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2417         pix1 += line_size;
2418         pix2 += line_size;
2419         pix3 += line_size;
2420     }
2421     return s;
2422 }
2423 
2424 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2425 {
2426     int s, i;
2427     uint8_t *pix3 = pix2 + line_size;
2428 
2429     s = 0;
2430     for(i=0;i<h;i++) {
2431         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2432         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2433         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2434         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2435         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2436         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2437         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2438         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2439         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2440         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2441         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2442         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2443         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2444         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2445         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2446         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2447         pix1 += line_size;
2448         pix2 += line_size;
2449         pix3 += line_size;
2450     }
2451     return s;
2452 }
2453 
2454 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2455 {
2456     int s, i;
2457 
2458     s = 0;
2459     for(i=0;i<h;i++) {
2460         s += abs(pix1[0] - pix2[0]);
2461         s += abs(pix1[1] - pix2[1]);
2462         s += abs(pix1[2] - pix2[2]);
2463         s += abs(pix1[3] - pix2[3]);
2464         s += abs(pix1[4] - pix2[4]);
2465         s += abs(pix1[5] - pix2[5]);
2466         s += abs(pix1[6] - pix2[6]);
2467         s += abs(pix1[7] - pix2[7]);
2468         pix1 += line_size;
2469         pix2 += line_size;
2470     }
2471     return s;
2472 }
2473 
2474 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2475 {
2476     int s, i;
2477 
2478     s = 0;
2479     for(i=0;i<h;i++) {
2480         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2481         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2482         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2483         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2484         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2485         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2486         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2487         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2488         pix1 += line_size;
2489         pix2 += line_size;
2490     }
2491     return s;
2492 }
2493 
2494 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2495 {
2496     int s, i;
2497     uint8_t *pix3 = pix2 + line_size;
2498 
2499     s = 0;
2500     for(i=0;i<h;i++) {
2501         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2502         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2503         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2504         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2505         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2506         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2507         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2508         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2509         pix1 += line_size;
2510         pix2 += line_size;
2511         pix3 += line_size;
2512     }
2513     return s;
2514 }
2515 
2516 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2517 {
2518     int s, i;
2519     uint8_t *pix3 = pix2 + line_size;
2520 
2521     s = 0;
2522     for(i=0;i<h;i++) {
2523         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2524         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2525         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2526         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2527         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2528         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2529         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2530         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2531         pix1 += line_size;
2532         pix2 += line_size;
2533         pix3 += line_size;
2534     }
2535     return s;
2536 }
2537 
2538 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2539     int i;
2540     unsigned int sum=0;
2541 
2542     for(i=0; i<8*8; i++){
2543         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2544         int w= weight[i];
2545         b>>= RECON_SHIFT;
2546         assert(-512<b && b<512);
2547 
2548         sum += (w*b)*(w*b)>>4;
2549     }
2550     return sum>>2;
2551 }
2552 
2553 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2554     int i;
2555 
2556     for(i=0; i<8*8; i++){
2557         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2558     }
2559 }
2560 
2561 /**
2562  * permutes an 8x8 block.
2563  * @param block the block which will be permuted according to the given permutation vector
2564  * @param permutation the permutation vector
2565  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2566  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2567  *                  (inverse) permutated to scantable order!
2568  */
2569 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2570 {
2571     int i;
2572     DCTELEM temp[64];
2573 
2574     if(last<=0) return;
2575     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2576 
2577     for(i=0; i<=last; i++){
2578         const int j= scantable[i];
2579         temp[j]= block[j];
2580         block[j]=0;
2581     }
2582 
2583     for(i=0; i<=last; i++){
2584         const int j= scantable[i];
2585         const int perm_j= permutation[j];
2586         block[perm_j]= temp[j];
2587     }
2588 }
2589 
/* Dummy comparison function for FF_CMP_ZERO: always reports zero cost. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
2593 
2594 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2595     int i;
2596 
2597     memset(cmp, 0, sizeof(void*)*5);
2598 
2599     for(i=0; i<5; i++){
2600         switch(type&0xFF){
2601         case FF_CMP_SAD:
2602             cmp[i]= c->sad[i];
2603             break;
2604         case FF_CMP_SATD:
2605             cmp[i]= c->hadamard8_diff[i];
2606             break;
2607         case FF_CMP_SSE:
2608             cmp[i]= c->sse[i];
2609             break;
2610         case FF_CMP_DCT:
2611             cmp[i]= c->dct_sad[i];
2612             break;
2613         case FF_CMP_PSNR:
2614             cmp[i]= c->quant_psnr[i];
2615             break;
2616         case FF_CMP_BIT:
2617             cmp[i]= c->bit[i];
2618             break;
2619         case FF_CMP_RD:
2620             cmp[i]= c->rd[i];
2621             break;
2622         case FF_CMP_VSAD:
2623             cmp[i]= c->vsad[i];
2624             break;
2625         case FF_CMP_VSSE:
2626             cmp[i]= c->vsse[i];
2627             break;
2628         case FF_CMP_ZERO:
2629             cmp[i]= zero_cmp;
2630             break;
2631         default:
2632             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2633         }
2634     }
2635 }
2636 
2637 /**
2638  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2639  */
2640 static void clear_blocks_c(DCTELEM *blocks)
2641 {
2642     memset(blocks, 0, sizeof(DCTELEM)*6*64);
2643 }
2644 
/* dst[i] += src[i] for i in [0, w); byte addition wraps modulo 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] += src[i];
}
2660 
/* dst[i] = src1[i] - src2[i] for i in [0, w); wraps modulo 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
2676 
/* Median-prediction residual pass (HuffYUV style): for each position,
 * pred is the median of the running left value, src1[i], and their
 * gradient (left + src1[i] - top_left, wrapped to a byte); the output
 * is src2[i] minus that prediction. The running left/top-left state is
 * read from and written back through *left / *left_top. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t left_val = *left;
    uint8_t top_left = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - top_left)&0xFF);
        top_left = src1[i];
        left_val = src2[i];
        dst[i] = left_val - pred;
    }

    *left = left_val;
    *left_top = top_left;
}
2694 #endif
#if 0
/* o1/o2 = sum and difference of i1/i2: one Hadamard butterfly stage. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place butterfly: x,y = x+y, x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Sum of absolute values of the two final butterfly outputs. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2710 
/* SATD of an 8x8 block: applies an 8x8 Hadamard transform (three
 * butterfly stages per dimension, via the BUTTERFLY macros) to the
 * pixel difference src - dst and returns the sum of the absolute
 * transform coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficient| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
2762 
/* Intra SATD of an 8x8 block: Hadamard-transforms the source pixels
 * themselves (no reference) and sums the absolute coefficients, then
 * subtracts the DC term so a flat block scores zero. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal transform of each source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical transform of each column, accumulating |coefficient| */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2810 
/* DCT-domain SAD: forward-transforms the 8x8 pixel difference and
 * returns the sum of the absolute coefficient values. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2827 
void simple_idct(DCTELEM *block); //FIXME

/* Quantization-noise metric: takes the 8x8 pixel difference, saves a
 * copy, runs it through quantize -> dequantize -> IDCT, and returns the
 * summed squared deviation from the saved copy. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2853 
/* Rate-distortion score for an 8x8 block: quantizes the pixel
 * difference, counts the VLC bits needed to code the coefficients,
 * reconstructs the block (dequantize + idct_add onto a saved copy of
 * src2), measures the SSE against src1, and combines the two with a
 * qscale-dependent lambda. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 prediction block so it can be reconstructed onto */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC scanning at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* estimate the bit cost from (run, level) VLC length tables;
       levels outside the table range cost the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct and measure the actual coding error */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2932 
/* Bit-cost metric for an 8x8 block: quantizes the pixel difference and
 * returns the estimated number of VLC bits needed to code it (same
 * estimation as rd8x8_c, without the distortion term). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* intra blocks code the DC separately and start AC scanning at 1 */
    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* accumulate (run, level) VLC lengths; out-of-range levels cost
       the escape length */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2992 
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int x, y;

    /* Sum of absolute vertical gradients within a 16-wide block
       (h-1 row pairs are compared). */
    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += ABS(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3007 
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int sum = 0;
    int x, y;

    /* Sum of absolute vertical gradients of the difference signal
       s1 - s2 within a 16-wide block. */
    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += ABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
        s1 += stride;
        s2 += stride;
    }

    return sum;
}
3022 
#define SQ(a) ((a)*(a))
/* Sum of squared vertical gradients within a 16-wide block. */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int sum = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++)
            sum += SQ(s[x] - s[x + stride]);
        s += stride;
    }

    return sum;
}
3038 
3039 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3040     int score=0;
3041     int x,y;
3042 
3043     for(y=1; y<h; y++){
3044         for(x=0; x<16; x++){
3045             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3046         }
3047         s1+= stride;
3048         s2+= stride;
3049     }
3050 
3051     return score;
3052 }
3053 
/* Generate the 16x16 variants of the 8x8 comparison functions.
   WARPER8_16_SQ is defined in dsputil.h; presumably each generated wrapper
   applies the given 8x8 routine across the 16x16 area and combines the
   results -- confirm against the macro definition.  ("warper" is a
   historical misspelling of "wrapper".) */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3060 
/* XXX: these wrapper functions should be removed as soon as all IDCTs are
 converted */
/* Run the reference integer inverse DCT (j_rev_dct) on 'block' in place,
   then store the result into 'dest' via put_pixels_clamped_c()
   (line_size = destination stride in bytes). */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the reference integer inverse DCT (j_rev_dct) on 'block' in place,
   then add the result onto the existing pixels at 'dest' via
   add_pixels_clamped_c() (line_size = destination stride in bytes). */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3073 #endif
3074 /* init static data */
dsputil_static_init(void)3075 void dsputil_static_init(void)
3076 {
3077     int i;
3078 
3079     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3080     for(i=0;i<MAX_NEG_CROP;i++) {
3081         cropTbl[i] = 0;
3082         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3083     }
3084 
3085     for(i=0;i<512;i++) {
3086         squareTbl[i] = (i - 256) * (i - 256);
3087     }
3088 
3089     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3090 }
3091 
3092 #if 0
/**
 * Initialize a DSPContext: install the portable C implementation for every
 * function pointer, let the platform-specific initializers (MMX, ARM, ...)
 * override individual entries, then build the IDCT coefficient permutation
 * table matching the selected IDCT.
 * NOTE(review): this function sits inside the surrounding "#if 0" block and
 * is therefore currently compiled out.
 *
 * @param c     context whose function pointers are filled in
 * @param avctx codec context; avctx->dct_algo / avctx->idct_algo select
 *              which (I)DCT implementations are installed
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
	c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
	c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
	c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; idct_permutation_type records the coefficient
       ordering the chosen IDCT expects (see the switch at the end) */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* basic pixel/block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16  [1] 8 */
    /* SAD with halfpel interpolation: second index 0=full, 1=x-half,
       2=y-half, 3=xy-half */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* halfpel motion compensation tables; second index: 0=copy, 1=x-half,
   2=y-half, 3=xy-half */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    /* thirdpel MC; naming is mc<x><y>, index = x + 4*y
       (indices 3, 7 and 11+ are unused) */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarterpel MC: all 16 subpel positions, index = x + 4*y */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC: [0]=8x8, [1]=4x4, [2]=2x2 */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* comparison functions used for motion estimation / mode decision:
   slot [0] is the 16x16 variant, slot [1] the 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#if 0
    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;
#endif
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* let platform-specific initializers override the C implementations */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table matching the IDCT's
       expected input ordering */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3312 #endif
3313