1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */
22
23 /**
24 * @file dsputil.c
25 * DSP utils
26 */
27
28 #include "avcodec.h"
29 #include "dsputil.h"
30 //#include "mpegvideo.h"
31 #include "simple_idct.h"
32 //#include "faandct.h"
33
/* Byte-clamp lookup table: indexed with MAX_NEG_CROP offset so that values
 * in [-MAX_NEG_CROP, 255+MAX_NEG_CROP) are clipped into 0..255.
 * NOTE(review): filled at init time elsewhere (not in this view) — confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Squares of -256..255; used via (squareTbl + 256) so a signed pixel
 * difference can be squared with a single lookup.
 * NOTE(review): also filled at init time elsewhere — confirm. */
uint32_t squareTbl[512];
36
/* Standard JPEG/MPEG zigzag scan: maps scan position to the raster index
 * of the coefficient inside an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields (scan position -> raster index
   of the coefficient, with even/odd field rows alternating). */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
60
/* not permutated inverse zigzag_direct + 1 for MMX quantizer.
 * NOTE(review): uninitialized here; presumably filled by the DSP init code
 * outside this view — confirm before relying on its contents. */
uint16_t __align8 inv_zigzag_direct16[64];
63
/* MPEG-2 "alternate" scan favouring horizontally-oriented coefficient
 * energy (scan position -> raster index in the 8x8 block). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
74
/* MPEG-2 "alternate" scan favouring vertically-oriented coefficient
 * energy (scan position -> raster index in the 8x8 block). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
85
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* i.e. inverse[b] is 2^32/b rounded up; inverse[1] saturates to 0xFFFFFFFF
 * since 2^32 itself is not representable in 32 bits. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
121
/* Input permutation for the simple_idct_mmx: entry i gives the source
 * coefficient index (0x00..0x3F) that scan position i must read so the
 * MMX IDCT can operate on its preferred coefficient layout. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
133 #if 0
/**
 * Sum all 256 pixels of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between consecutive rows
 * @return the sum of all pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;   /* advance to the next row */
    }
    return total;
}
155
/**
 * Sum of squared pixel values over a 16x16 block.
 * Each byte is squared via the global squareTbl lookup (offset by 256).
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between consecutive rows
 * @return sum of pix[x]^2 over the 256 pixels
 * NOTE(review): the word-sized loads below type-pun the byte buffer through
 * 32/64-bit pointer casts — that violates strict aliasing and assumes
 * adequate alignment; confirm the callers guarantee aligned input.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = squareTbl + 256;   /* center of the -256..255 square table */

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            /* reference byte-by-byte variant, kept for clarity */
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            /* 64-bit hosts: one 8-byte load, then per-byte extraction */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            /* 32-bit hosts: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;   /* inner loop already advanced by 16 */
    }
    return s;
}
203
/**
 * Byte-swap w 32-bit words from src into dst.
 * dst may alias src (in-place swap of a whole buffer).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i = 0;

    /* bulk: eight words per outer step */
    while (i + 8 <= w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = bswap_32(src[i + k]);
        i += 8;
    }
    /* tail: remaining 0..7 words */
    while (i < w) {
        dst[i] = bswap_32(src[i]);
        i++;
    }
}
221
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
223 {
224 int s, i;
225 uint32_t *sq = squareTbl + 256;
226
227 s = 0;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
237 pix1 += line_size;
238 pix2 += line_size;
239 }
240 return s;
241 }
242
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
244 {
245 int s, i;
246 uint32_t *sq = squareTbl + 256;
247
248 s = 0;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
266
267 pix1 += line_size;
268 pix2 += line_size;
269 }
270 return s;
271 }
272
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
274 {
275 int i;
276
277 /* read the pixels */
278 for(i=0;i<8;i++) {
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
287 pixels += line_size;
288 block += 8;
289 }
290 }
291
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
294 int i;
295
296 /* read the pixels */
297 for(i=0;i<8;i++) {
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
306 s1 += stride;
307 s2 += stride;
308 block += 8;
309 }
310 }
311
312
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
314 int line_size)
315 {
316 int i;
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
318
319 /* read the pixels */
320 for(i=0;i<8;i++) {
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
329
330 pixels += line_size;
331 block += 8;
332 }
333 }
334
335 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
336 int line_size)
337 {
338 int i;
339 uint8_t *cm = cropTbl + MAX_NEG_CROP;
340
341 /* read the pixels */
342 for(i=0;i<8;i++) {
343 pixels[0] = cm[pixels[0] + block[0]];
344 pixels[1] = cm[pixels[1] + block[1]];
345 pixels[2] = cm[pixels[2] + block[2]];
346 pixels[3] = cm[pixels[3] + block[3]];
347 pixels[4] = cm[pixels[4] + block[4]];
348 pixels[5] = cm[pixels[5] + block[5]];
349 pixels[6] = cm[pixels[6] + block[6]];
350 pixels[7] = cm[pixels[7] + block[7]];
351 pixels += line_size;
352 block += 8;
353 }
354 }
355 #endif
356 #if 0
357
/*
 * 64-bit variant of the PIXOP2 primitive generator (dead code: disabled by
 * the surrounding #if 0).  For a given OPNAME it instantiates full-pel copy
 * plus x/y/xy half-pel interpolation, processing 8 pixels at a time through
 * unaligned 64-bit loads (LD64, defined elsewhere).  OP(dst, val) is the
 * write primitive (put = plain store, avg = rounded average with dst).
 * The bit tricks compute packed per-byte averages without unpacking:
 * (a&b)+(((a^b)&0xFE..)>>1) truncates, (a|b)-(((a^b)&0xFE..)>>1) rounds up.
 * NOTE(review): this branch defines OPNAME ## _pixels but the
 * CALL_2X_PIXELS lines at the end reference OPNAME ## _pixels_c — that
 * name mismatch would break compilation if this branch were re-enabled.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), LD64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= LD64(pixels  );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0=  (a&0x0303030303030303ULL)\
                + (b&0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels  );\
        uint64_t b= LD64(pixels+1);\
        l1=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD64(pixels  );\
        b= LD64(pixels+1);\
        l0=  (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* write primitive: rounded per-byte average with the existing contents */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
498 #else // 64 bit variant
499
/*
 * 32-bit variant of the PIXOP2 primitive generator (the branch actually
 * selected by the #if/#else above).  For a given OPNAME it instantiates the
 * full set of block-copy and half-pel interpolation primitives for widths
 * 2/4/8/16, built on unaligned 16/32-bit loads (LD16/LD32) and the packed
 * per-byte averaging helpers rnd_avg32 / no_rnd_avg32 (all defined
 * elsewhere).  OP(dst, val) is the write primitive (put or avg).
 * The _l2 helpers average two sources; the _l4 helpers average four with
 * the 0x02020202 (rounding) / 0x01010101 (no-rounding) bias trick on the
 * low 2 bits of each byte.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= LD32(pixels  );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels  );\
        uint32_t b= LD32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= LD32(pixels  );\
        b= LD32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= LD32(pixels  );\
            b= LD32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* write primitive: rounded per-byte average with the existing contents */
#define op_avg(a, b) a = rnd_avg32(a, b)
867 #endif
/* write primitive: plain store */
#define op_put(a, b) a = b

/* the generic C pixel ops are currently not instantiated here */
//PIXOP2(avg, op_avg)
//PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* scalar rounded averages of 2 and 4 values, used by the MC code below */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
877
878 #if 0
/**
 * Bilinear motion compensation of an 8-pixel-wide block with 1/16-pel
 * fractional position (used for 1-warp-point GMC).
 * @param x16,y16  fractional offsets in 1/16 pel, 0..16
 * @param rounder  bias added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int y, x;

    for (y = 0; y < h; y++) {
        /* weighted sum of the 2x2 neighbourhood around each pixel */
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
901
/**
 * Affine global motion compensation with edge clipping for an
 * 8-pixel-wide block.
 * (ox,oy) is the source position of the first output pixel in fixed
 * point; (dxx,dyx) step it per output column and (dxy,dyy) per output
 * row.  shift is the fractional precision of the interpolation, r the
 * rounding bias, width/height the valid source dimensions.
 * NOTE(review): relies on the clip() helper defined elsewhere in the
 * codebase.
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;    /* convert sizes to maximum valid coordinates */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);   /* fractional bits below `shift` */
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            /* unsigned compares also reject negative coordinates */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside the source: bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* clipped vertically: interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* clipped horizontally: interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* clipped both ways: replicate the nearest edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
959
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel case: plain block copy of the matching width. */
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
968
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Horizontal third-pel interpolation, weights 2:1 on (p0,p1);
       683/2048 approximates 1/3, so this is round((2*p0 + p1)/3). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p0 = src[x];
            const int p1 = src[x + 1];
            dst[x] = (683 * (2 * p0 + p1 + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
979
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Horizontal third-pel interpolation, weights 1:2 on (p0,p1);
       683/2048 approximates 1/3, so this is round((p0 + 2*p1)/3). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p0 = src[x];
            const int p1 = src[x + 1];
            dst[x] = (683 * (p0 + 2 * p1 + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
990
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Vertical third-pel interpolation, weights 2:1 on (top,bottom);
       683/2048 approximates 1/3. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int top = src[x];
            const int bot = src[x + stride];
            dst[x] = (683 * (2 * top + bot + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1001
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* 2-D third-pel interpolation, weights 4:3:3:2 over the 2x2 neighbourhood;
       2731/32768 approximates 1/12 (the weights sum to 12). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p00 = src[x];
            const int p01 = src[x + 1];
            const int p10 = src[x + stride];
            const int p11 = src[x + stride + 1];
            dst[x] = (2731 * (4 * p00 + 3 * p01 + 3 * p10 + 2 * p11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1012
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* 2-D third-pel interpolation, weights 3:2:4:3 over the 2x2 neighbourhood;
       2731/32768 approximates 1/12 (the weights sum to 12). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p00 = src[x];
            const int p01 = src[x + 1];
            const int p10 = src[x + stride];
            const int p11 = src[x + stride + 1];
            dst[x] = (2731 * (3 * p00 + 2 * p01 + 4 * p10 + 3 * p11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1023
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Vertical third-pel interpolation, weights 1:2 on (top,bottom);
       683/2048 approximates 1/3. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int top = src[x];
            const int bot = src[x + stride];
            dst[x] = (683 * (top + 2 * bot + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
1034
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* 2-D third-pel interpolation, weights 3:4:2:3 over the 2x2 neighbourhood;
       2731/32768 approximates 1/12 (the weights sum to 12). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p00 = src[x];
            const int p01 = src[x + 1];
            const int p10 = src[x + stride];
            const int p11 = src[x + stride + 1];
            dst[x] = (2731 * (3 * p00 + 4 * p01 + 2 * p10 + 3 * p11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1045
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* 2-D third-pel interpolation, weights 2:3:3:4 over the 2x2 neighbourhood;
       2731/32768 approximates 1/12 (the weights sum to 12). */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int p00 = src[x];
            const int p01 = src[x + 1];
            const int p10 = src[x + stride];
            const int p11 = src[x + stride + 1];
            dst[x] = (2731 * (2 * p00 + 3 * p01 + 3 * p10 + 4 * p11 + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
1056
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Full-pel case: plain averaging block copy of the matching width. */
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
1065
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc10_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1076
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc20_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1087
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc01_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1098
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc11_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (2731 * (4 * src[x] + 3 * src[x + 1]
                                      + 3 * src[x + stride] + 2 * src[x + stride + 1]
                                      + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1109
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc12_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (2731 * (3 * src[x] + 2 * src[x + 1]
                                      + 4 * src[x + stride] + 3 * src[x + stride + 1]
                                      + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1120
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc02_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1131
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc21_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (2731 * (3 * src[x] + 4 * src[x + 1]
                                      + 2 * src[x + stride] + 3 * src[x + stride + 1]
                                      + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1142
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* Same interpolation as put_tpel_pixels_mc22_c, then rounded average
       with the existing destination pixel. */
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            const int interp = (2731 * (2 * src[x] + 3 * src[x + 1]
                                      + 3 * src[x + stride] + 4 * src[x + stride + 1]
                                      + 6)) >> 15;
            dst[x] = (dst[x] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1153 #if 0
1154 #define TPEL_WIDTH(width)\
1155 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1156 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1157 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1158 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1159 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1160 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1161 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1162 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1163 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1164 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1165 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1166 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1167 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1168 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1169 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1170 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1171 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1172 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1173 #endif
1174
/**
 * Generates the H.264 chroma motion-compensation functions for block
 * widths 2, 4 and 8.  A..D are the bilinear weights of the four
 * neighbouring samples for the 1/8-pel offset (x,y); since
 * A+B+C+D == 64, OP performs the +32 rounding and >>6 normalisation
 * (op_put) or additionally averages with the destination (op_avg).
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    /* bilinear weights of the 2x2 source neighbourhood; A+B+C+D == 64 */\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1237
/* OP plugged into H264_CHROMA_MC: op_put rounds (+32) and normalises (>>6);
   op_avg additionally averages the result with the existing dst pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1245
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy h rows of 4 bytes each via an unaligned 32-bit load/store pair. */
    int rows = h;
    while (rows-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1256
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy h rows of 8 bytes each via two unaligned 32-bit load/store pairs. */
    int rows = h;
    while (rows-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1268
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy h rows of 16 bytes each via four unaligned 32-bit load/store pairs. */
    int rows = h;
    while (rows-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1282
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy h rows of 17 bytes each: four 32-bit chunks plus the odd tail
       byte (17-wide rows feed the 16-tap qpel filters' extra column). */
    int rows = h;
    while (rows-- > 0) {
        ST32(dst,      LD32(src));
        ST32(dst + 4,  LD32(src + 4));
        ST32(dst + 8,  LD32(src + 8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1297
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    /* Copy h rows of 9 bytes each: two 32-bit chunks plus the odd tail
       byte (9-wide rows feed the 8-tap qpel filters' extra column). */
    int rows = h;
    while (rows-- > 0) {
        ST32(dst,     LD32(src));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1310
1311
1312 #define QPEL_MC(r, OPNAME, RND, OP) \
1313 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1314 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1315 int i;\
1316 for(i=0; i<h; i++)\
1317 {\
1318 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1319 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1320 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1321 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1322 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1323 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1324 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1325 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1326 dst+=dstStride;\
1327 src+=srcStride;\
1328 }\
1329 }\
1330 \
1331 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1332 const int w=8;\
1333 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1334 int i;\
1335 for(i=0; i<w; i++)\
1336 {\
1337 const int src0= src[0*srcStride];\
1338 const int src1= src[1*srcStride];\
1339 const int src2= src[2*srcStride];\
1340 const int src3= src[3*srcStride];\
1341 const int src4= src[4*srcStride];\
1342 const int src5= src[5*srcStride];\
1343 const int src6= src[6*srcStride];\
1344 const int src7= src[7*srcStride];\
1345 const int src8= src[8*srcStride];\
1346 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1347 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1348 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1349 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1350 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1351 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1352 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1353 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1354 dst++;\
1355 src++;\
1356 }\
1357 }\
1358 \
1359 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1361 int i;\
1362 \
1363 for(i=0; i<h; i++)\
1364 {\
1365 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1366 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1367 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1368 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1369 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1370 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1371 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1372 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1373 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1374 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1375 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1376 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1377 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1378 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1379 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1380 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1381 dst+=dstStride;\
1382 src+=srcStride;\
1383 }\
1384 }\
1385 \
1386 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1388 int i;\
1389 const int w=16;\
1390 for(i=0; i<w; i++)\
1391 {\
1392 const int src0= src[0*srcStride];\
1393 const int src1= src[1*srcStride];\
1394 const int src2= src[2*srcStride];\
1395 const int src3= src[3*srcStride];\
1396 const int src4= src[4*srcStride];\
1397 const int src5= src[5*srcStride];\
1398 const int src6= src[6*srcStride];\
1399 const int src7= src[7*srcStride];\
1400 const int src8= src[8*srcStride];\
1401 const int src9= src[9*srcStride];\
1402 const int src10= src[10*srcStride];\
1403 const int src11= src[11*srcStride];\
1404 const int src12= src[12*srcStride];\
1405 const int src13= src[13*srcStride];\
1406 const int src14= src[14*srcStride];\
1407 const int src15= src[15*srcStride];\
1408 const int src16= src[16*srcStride];\
1409 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1410 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1411 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1412 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1413 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1414 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1415 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1416 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1417 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1418 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1419 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1420 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1421 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1422 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1423 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1424 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1425 dst++;\
1426 src++;\
1427 }\
1428 }\
1429 \
1430 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1431 OPNAME ## pixels8_c(dst, src, stride, 8);\
1432 }\
1433 \
1434 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1435 uint8_t half[64];\
1436 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1437 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1438 }\
1439 \
1440 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1441 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1442 }\
1443 \
1444 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1445 uint8_t half[64];\
1446 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1447 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1448 }\
1449 \
1450 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1451 uint8_t full[16*9];\
1452 uint8_t half[64];\
1453 copy_block9(full, src, 16, stride, 9);\
1454 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1455 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1456 }\
1457 \
1458 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1459 uint8_t full[16*9];\
1460 copy_block9(full, src, 16, stride, 9);\
1461 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1462 }\
1463 \
1464 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1465 uint8_t full[16*9];\
1466 uint8_t half[64];\
1467 copy_block9(full, src, 16, stride, 9);\
1468 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1469 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1470 }\
1471 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1472 uint8_t full[16*9];\
1473 uint8_t halfH[72];\
1474 uint8_t halfV[64];\
1475 uint8_t halfHV[64];\
1476 copy_block9(full, src, 16, stride, 9);\
1477 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1478 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1479 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1480 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1481 }\
1482 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1483 uint8_t full[16*9];\
1484 uint8_t halfH[72];\
1485 uint8_t halfHV[64];\
1486 copy_block9(full, src, 16, stride, 9);\
1487 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1488 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1489 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1490 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1491 }\
1492 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1493 uint8_t full[16*9];\
1494 uint8_t halfH[72];\
1495 uint8_t halfV[64];\
1496 uint8_t halfHV[64];\
1497 copy_block9(full, src, 16, stride, 9);\
1498 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1499 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1500 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1501 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1502 }\
1503 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1504 uint8_t full[16*9];\
1505 uint8_t halfH[72];\
1506 uint8_t halfHV[64];\
1507 copy_block9(full, src, 16, stride, 9);\
1508 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1509 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1510 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1511 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1512 }\
1513 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1514 uint8_t full[16*9];\
1515 uint8_t halfH[72];\
1516 uint8_t halfV[64];\
1517 uint8_t halfHV[64];\
1518 copy_block9(full, src, 16, stride, 9);\
1519 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1520 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1521 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1522 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1523 }\
1524 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1525 uint8_t full[16*9];\
1526 uint8_t halfH[72];\
1527 uint8_t halfHV[64];\
1528 copy_block9(full, src, 16, stride, 9);\
1529 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1530 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1531 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1532 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1533 }\
1534 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1535 uint8_t full[16*9];\
1536 uint8_t halfH[72];\
1537 uint8_t halfV[64];\
1538 uint8_t halfHV[64];\
1539 copy_block9(full, src, 16, stride, 9);\
1540 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1541 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1542 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1543 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1544 }\
1545 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1546 uint8_t full[16*9];\
1547 uint8_t halfH[72];\
1548 uint8_t halfHV[64];\
1549 copy_block9(full, src, 16, stride, 9);\
1550 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1551 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1552 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1553 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1554 }\
1555 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1556 uint8_t halfH[72];\
1557 uint8_t halfHV[64];\
1558 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1559 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1560 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1561 }\
1562 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1563 uint8_t halfH[72];\
1564 uint8_t halfHV[64];\
1565 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1566 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1567 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1568 }\
1569 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1570 uint8_t full[16*9];\
1571 uint8_t halfH[72];\
1572 uint8_t halfV[64];\
1573 uint8_t halfHV[64];\
1574 copy_block9(full, src, 16, stride, 9);\
1575 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1576 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1577 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1578 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1579 }\
1580 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1581 uint8_t full[16*9];\
1582 uint8_t halfH[72];\
1583 copy_block9(full, src, 16, stride, 9);\
1584 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1585 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1586 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1587 }\
1588 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1589 uint8_t full[16*9];\
1590 uint8_t halfH[72];\
1591 uint8_t halfV[64];\
1592 uint8_t halfHV[64];\
1593 copy_block9(full, src, 16, stride, 9);\
1594 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1595 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1596 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1597 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1598 }\
1599 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1600 uint8_t full[16*9];\
1601 uint8_t halfH[72];\
1602 copy_block9(full, src, 16, stride, 9);\
1603 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1604 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1605 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1606 }\
1607 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t halfH[72];\
1609 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1610 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1611 }\
1612 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1613 OPNAME ## pixels16_c(dst, src, stride, 16);\
1614 }\
1615 \
1616 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1617 uint8_t half[256];\
1618 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1619 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1620 }\
1621 \
1622 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1623 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1624 }\
1625 \
1626 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t half[256];\
1628 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1629 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1630 }\
1631 \
1632 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1633 uint8_t full[24*17];\
1634 uint8_t half[256];\
1635 copy_block17(full, src, 24, stride, 17);\
1636 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1637 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1638 }\
1639 \
1640 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1641 uint8_t full[24*17];\
1642 copy_block17(full, src, 24, stride, 17);\
1643 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1644 }\
1645 \
1646 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1647 uint8_t full[24*17];\
1648 uint8_t half[256];\
1649 copy_block17(full, src, 24, stride, 17);\
1650 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1651 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1652 }\
1653 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1654 uint8_t full[24*17];\
1655 uint8_t halfH[272];\
1656 uint8_t halfV[256];\
1657 uint8_t halfHV[256];\
1658 copy_block17(full, src, 24, stride, 17);\
1659 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1660 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1661 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1662 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1663 }\
1664 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1665 uint8_t full[24*17];\
1666 uint8_t halfH[272];\
1667 uint8_t halfHV[256];\
1668 copy_block17(full, src, 24, stride, 17);\
1669 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1670 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1671 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1672 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1673 }\
1674 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1675 uint8_t full[24*17];\
1676 uint8_t halfH[272];\
1677 uint8_t halfV[256];\
1678 uint8_t halfHV[256];\
1679 copy_block17(full, src, 24, stride, 17);\
1680 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1681 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1682 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1683 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1684 }\
1685 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1686 uint8_t full[24*17];\
1687 uint8_t halfH[272];\
1688 uint8_t halfHV[256];\
1689 copy_block17(full, src, 24, stride, 17);\
1690 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1691 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1692 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1693 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1694 }\
1695 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[24*17];\
1697 uint8_t halfH[272];\
1698 uint8_t halfV[256];\
1699 uint8_t halfHV[256];\
1700 copy_block17(full, src, 24, stride, 17);\
1701 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1702 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1703 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1704 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1705 }\
1706 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1707 uint8_t full[24*17];\
1708 uint8_t halfH[272];\
1709 uint8_t halfHV[256];\
1710 copy_block17(full, src, 24, stride, 17);\
1711 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1712 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1713 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1714 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1715 }\
1716 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[24*17];\
1718 uint8_t halfH[272];\
1719 uint8_t halfV[256];\
1720 uint8_t halfHV[256];\
1721 copy_block17(full, src, 24, stride, 17);\
1722 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1723 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1724 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1725 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1726 }\
1727 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1728 uint8_t full[24*17];\
1729 uint8_t halfH[272];\
1730 uint8_t halfHV[256];\
1731 copy_block17(full, src, 24, stride, 17);\
1732 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1733 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1734 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1735 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1736 }\
1737 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t halfH[272];\
1739 uint8_t halfHV[256];\
1740 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1741 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1742 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1743 }\
1744 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t halfH[272];\
1746 uint8_t halfHV[256];\
1747 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1748 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1749 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1750 }\
1751 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1752 uint8_t full[24*17];\
1753 uint8_t halfH[272];\
1754 uint8_t halfV[256];\
1755 uint8_t halfHV[256];\
1756 copy_block17(full, src, 24, stride, 17);\
1757 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1758 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1759 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1760 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1761 }\
1762 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1763 uint8_t full[24*17];\
1764 uint8_t halfH[272];\
1765 copy_block17(full, src, 24, stride, 17);\
1766 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1767 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1768 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1769 }\
1770 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1771 uint8_t full[24*17];\
1772 uint8_t halfH[272];\
1773 uint8_t halfV[256];\
1774 uint8_t halfHV[256];\
1775 copy_block17(full, src, 24, stride, 17);\
1776 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1777 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1778 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1779 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1780 }\
1781 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1782 uint8_t full[24*17];\
1783 uint8_t halfH[272];\
1784 copy_block17(full, src, 24, stride, 17);\
1785 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1786 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1787 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1788 }\
1789 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t halfH[272];\
1791 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1792 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1793 }
1794
/* Rounding primitives plugged into QPEL_MC above: b is the raw 6-tap filter
   sum, cm (cropTbl + MAX_NEG_CROP) clips the rounded value (b+16)>>5 to
   0..255; the _no_rnd_ variants round down ((b+15)>>5) as required by the
   MPEG-4 "rounding_control" flag.  op_put stores the result, op_avg averages
   it with the pixel already in the destination. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the MPEG-4 quarter-pel MC families: put, put_no_rnd and avg. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
1808
1809 #if 1
/**
 * Generates the C implementations of the H.264 six-tap half-pel
 * interpolation filter (taps 1,-5,20,20,-5,1) for 4, 8 and 16 wide blocks.
 *
 * OPNAME - prefix of the generated function names (e.g. put_/avg_)
 * OP     - store/average macro for singly filtered sums, rounded (b+16)>>5
 * OP2    - store/average macro for the doubly filtered h+v sums in the
 *          _hv_ functions, which carry 5 extra bits of precision and are
 *          rounded (b+512)>>10
 *
 * _h_lowpass filters horizontally (reads src[-2..w+2]), _v_lowpass filters
 * vertically (reads rows -2..h+2), and _hv_lowpass first filters
 * horizontally into the int16_t tmp[] buffer (h+5 rows, starting 2 rows
 * above the block) and then vertically from tmp.  The 16-wide versions are
 * assembled from four 8x8 calls.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    tmp += 8*tmpStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2012
/**
 * Generates the 16 H.264 quarter-pel motion-compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c for one block SIZE (4, 8 or 16),
 * where X,Y in 0..3 are the horizontal/vertical quarter-pel offsets
 * (mc00 is the plain integer-pel copy).  Quarter-pel positions are formed
 * by averaging (pixels##SIZE##_l2) two half-pel or integer-pel planes.
 * full[] holds a padded copy of the source with 2 extra rows above and
 * 3 below (SIZE+5 rows total) so the vertical six-tap filter can read
 * outside the block; full_mid points at the block's first real row.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2149
/* H.264 rounding ops: OP handles singly filtered six-tap sums (rounded
   (b+16)>>5 and clipped through cm); OP2 handles the doubly filtered h+v
   sums of the _hv_ functions, which carry 5 extra precision bits and are
   rounded (b+512)>>10.  op_put stores, op_avg averages with the existing
   destination pixel (used for bi-directional prediction). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the put and avg families for block sizes 4, 8 and 16. */
H264_LOWPASS(put_ , op_put, op2_put)
H264_LOWPASS(avg_ , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2169 #endif
2170
2171 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2172 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2173 int i;
2174
2175 for(i=0; i<h; i++){
2176 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2177 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2178 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2179 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2180 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2181 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2182 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2183 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2184 dst+=dstStride;
2185 src+=srcStride;
2186 }
2187 }
2188
2189 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2190 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2191 int i;
2192
2193 for(i=0; i<w; i++){
2194 const int src_1= src[ -srcStride];
2195 const int src0 = src[0 ];
2196 const int src1 = src[ srcStride];
2197 const int src2 = src[2*srcStride];
2198 const int src3 = src[3*srcStride];
2199 const int src4 = src[4*srcStride];
2200 const int src5 = src[5*srcStride];
2201 const int src6 = src[6*srcStride];
2202 const int src7 = src[7*srcStride];
2203 const int src8 = src[8*srcStride];
2204 const int src9 = src[9*srcStride];
2205 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2206 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2207 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2208 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2209 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2210 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2211 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2212 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2213 src++;
2214 dst++;
2215 }
2216 }
2217
/* mspel (0,0) position: integer-pel, plain 8x8 copy, no filtering. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2221
2222 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2223 uint8_t half[64];
2224 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2225 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2226 }
2227
/* mspel (1/2,0): horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2231
2232 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2233 uint8_t half[64];
2234 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2235 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2236 }
2237
/* mspel (0,1/2): vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2241
2242 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2243 uint8_t halfH[88];
2244 uint8_t halfV[64];
2245 uint8_t halfHV[64];
2246 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2247 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2248 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2249 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2250 }
2251 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2252 uint8_t halfH[88];
2253 uint8_t halfV[64];
2254 uint8_t halfHV[64];
2255 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2256 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2257 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2258 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2259 }
2260 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2261 uint8_t halfH[88];
2262 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2263 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2264 }
2265 #endif
2266 #if 0
/* H.263 deblocking filter across a horizontal block edge (the edge between
   src[-stride] and src[0]); processes 8 columns.  Filter strength is looked
   up from the quantiser.  Currently compiled out (#if 0). */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        /* p0,p1 are the two rows above the edge, p2,p3 the two below */
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: d1 == d for small |d|, tapering back to 0
           once |d| reaches 2*strength (large steps are real edges, keep) */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clip to 0..255: bit 8 set means the +-d1 step left
           the byte range; ~(p>>31) is 0 for negative p and -1 (0xFF after
           the uint8_t store) for 256..511 */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        /* outer pixels get at most half the inner correction */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
}
2301
/* H.263 deblocking filter across a vertical block edge (the edge between
   src[-1] and src[0]); processes 8 rows.  Same arithmetic as
   h263_v_loop_filter_c but operating along rows.  Currently compiled out. */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        /* p0,p1 left of the edge, p2,p3 right of it */
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        /* piecewise-linear ramp: full correction for small |d|, tapering
           to 0 beyond 2*strength so genuine edges are preserved */
        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        /* branch-light clip to 0..255 via the bit-8 overflow flag */
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        /* outer pixels get at most half the inner correction */
        ad1= ABS(d1)>>1;

        d2= clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
}
2336 #endif
2337 #if 0
2338 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2339 {
2340 int s, i;
2341
2342 s = 0;
2343 for(i=0;i<h;i++) {
2344 s += abs(pix1[0] - pix2[0]);
2345 s += abs(pix1[1] - pix2[1]);
2346 s += abs(pix1[2] - pix2[2]);
2347 s += abs(pix1[3] - pix2[3]);
2348 s += abs(pix1[4] - pix2[4]);
2349 s += abs(pix1[5] - pix2[5]);
2350 s += abs(pix1[6] - pix2[6]);
2351 s += abs(pix1[7] - pix2[7]);
2352 s += abs(pix1[8] - pix2[8]);
2353 s += abs(pix1[9] - pix2[9]);
2354 s += abs(pix1[10] - pix2[10]);
2355 s += abs(pix1[11] - pix2[11]);
2356 s += abs(pix1[12] - pix2[12]);
2357 s += abs(pix1[13] - pix2[13]);
2358 s += abs(pix1[14] - pix2[14]);
2359 s += abs(pix1[15] - pix2[15]);
2360 pix1 += line_size;
2361 pix2 += line_size;
2362 }
2363 return s;
2364 }
2365
2366 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2367 {
2368 int s, i;
2369
2370 s = 0;
2371 for(i=0;i<h;i++) {
2372 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2373 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2374 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2375 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2376 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2377 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2378 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2379 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2380 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2381 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2382 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2383 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2384 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2385 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2386 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2387 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2388 pix1 += line_size;
2389 pix2 += line_size;
2390 }
2391 return s;
2392 }
2393
2394 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2395 {
2396 int s, i;
2397 uint8_t *pix3 = pix2 + line_size;
2398
2399 s = 0;
2400 for(i=0;i<h;i++) {
2401 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2402 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2403 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2404 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2405 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2406 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2407 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2408 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2409 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2410 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2411 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2412 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2413 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2414 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2415 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2416 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2417 pix1 += line_size;
2418 pix2 += line_size;
2419 pix3 += line_size;
2420 }
2421 return s;
2422 }
2423
2424 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2425 {
2426 int s, i;
2427 uint8_t *pix3 = pix2 + line_size;
2428
2429 s = 0;
2430 for(i=0;i<h;i++) {
2431 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2432 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2433 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2434 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2435 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2436 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2437 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2438 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2439 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2440 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2441 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2442 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2443 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2444 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2445 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2446 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2447 pix1 += line_size;
2448 pix2 += line_size;
2449 pix3 += line_size;
2450 }
2451 return s;
2452 }
2453
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* Plain 8-wide sum of absolute differences over h rows. */
    int total = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            total += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
2473
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the horizontally half-pel interpolated
       reference (avg2 of each sample and its right neighbour). */
    int acc = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            acc += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return acc;
}
2493
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the vertically half-pel interpolated
       reference (avg2 of each sample and the one below it). */
    uint8_t *below = pix2 + line_size;
    int acc = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            acc += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return acc;
}
2515
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    /* 8-wide SAD against the half-pel (x and y) interpolated reference:
       each reference sample is the avg4() of a 2x2 neighbourhood. */
    uint8_t *ref_top = pix2;
    uint8_t *ref_bot = pix2 + line_size;
    int acc = 0;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            acc += abs(pix1[x] - avg4(ref_top[x], ref_top[x + 1],
                                      ref_bot[x], ref_bot[x + 1]));
        pix1    += line_size;
        ref_top += line_size;
        ref_bot += line_size;
    }
    return acc;
}
2537
2538 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2539 int i;
2540 unsigned int sum=0;
2541
2542 for(i=0; i<8*8; i++){
2543 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2544 int w= weight[i];
2545 b>>= RECON_SHIFT;
2546 assert(-512<b && b<512);
2547
2548 sum += (w*b)*(w*b)>>4;
2549 }
2550 return sum>>2;
2551 }
2552
2553 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2554 int i;
2555
2556 for(i=0; i<8*8; i++){
2557 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2558 }
2559 }
2560
2561 /**
2562 * permutes an 8x8 block.
2563 * @param block the block which will be permuted according to the given permutation vector
2564 * @param permutation the permutation vector
2565 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2566 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2567 * (inverse) permutated to scantable order!
2568 */
2569 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2570 {
2571 int i;
2572 DCTELEM temp[64];
2573
2574 if(last<=0) return;
2575 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2576
2577 for(i=0; i<=last; i++){
2578 const int j= scantable[i];
2579 temp[j]= block[j];
2580 block[j]=0;
2581 }
2582
2583 for(i=0; i<=last; i++){
2584 const int j= scantable[i];
2585 const int perm_j= permutation[j];
2586 block[perm_j]= temp[j];
2587 }
2588 }
2589
/* Comparison function that rates every pair of blocks as identical
   (always returns cost 0); used for FF_CMP_ZERO. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s; (void)a; (void)b; (void)stride; (void)h;
    return 0;
}
2593
2594 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2595 int i;
2596
2597 memset(cmp, 0, sizeof(void*)*5);
2598
2599 for(i=0; i<5; i++){
2600 switch(type&0xFF){
2601 case FF_CMP_SAD:
2602 cmp[i]= c->sad[i];
2603 break;
2604 case FF_CMP_SATD:
2605 cmp[i]= c->hadamard8_diff[i];
2606 break;
2607 case FF_CMP_SSE:
2608 cmp[i]= c->sse[i];
2609 break;
2610 case FF_CMP_DCT:
2611 cmp[i]= c->dct_sad[i];
2612 break;
2613 case FF_CMP_PSNR:
2614 cmp[i]= c->quant_psnr[i];
2615 break;
2616 case FF_CMP_BIT:
2617 cmp[i]= c->bit[i];
2618 break;
2619 case FF_CMP_RD:
2620 cmp[i]= c->rd[i];
2621 break;
2622 case FF_CMP_VSAD:
2623 cmp[i]= c->vsad[i];
2624 break;
2625 case FF_CMP_VSSE:
2626 cmp[i]= c->vsse[i];
2627 break;
2628 case FF_CMP_ZERO:
2629 cmp[i]= zero_cmp;
2630 break;
2631 default:
2632 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2633 }
2634 }
2635 }
2636
2637 /**
2638 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2639 */
2640 static void clear_blocks_c(DCTELEM *blocks)
2641 {
2642 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2643 }
2644
/* dst[i] += src[i] for i in [0, w); bytes wrap modulo 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    /* main loop: eight bytes per iteration */
    while (i + 7 < w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] += src[i + k];
        i += 8;
    }
    /* leftover tail */
    while (i < w) {
        dst[i] += src[i];
        i++;
    }
}
2660
/* dst[i] = src1[i] - src2[i] for i in [0, w); bytes wrap modulo 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    /* main loop: eight bytes per iteration */
    while (i + 7 < w) {
        int k;
        for (k = 0; k < 8; k++)
            dst[i + k] = src1[i + k] - src2[i + k];
        i += 8;
    }
    /* leftover tail */
    while (i < w) {
        dst[i] = src1[i] - src2[i];
        i++;
    }
}
2676
/* HuffYUV median-prediction residual: for each position, predict from
   the left, top and top-left neighbours via mid_pred() and emit the
   difference. *left / *left_top carry state across calls. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t prev = *left;      /* left neighbour (from previous output row) */
    uint8_t diag = *left_top;  /* top-left neighbour */

    for (i = 0; i < w; i++) {
        const int top  = src1[i];
        const int pred = mid_pred(prev, top, (prev + top - diag) & 0xFF);
        diag   = top;
        prev   = src2[i];
        dst[i] = prev - pred;
    }

    *left     = prev;
    *left_top = diag;
}
2694 #endif
2695 #if 0
/* Butterfly into fresh outputs: o1 = i1 + i2, o2 = i1 - i2. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* In-place butterfly: (x, y) <- (x + y, x - y). */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* Final butterfly stage folded directly into an absolute-value sum. */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2710
/* SATD-style metric: butterflies the 8x8 difference between src and dst
   along rows, then columns (a Hadamard-like transform), and sums the
   absolute transformed values. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 3 butterfly stages per row of differences */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass: 2 butterfly stages plus a final stage folded into
       the absolute-value accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
 }
#endif
    return sum;
}
2762
/* Intra variant of hadamard8_diff8x8_c: transforms the source block
   itself (no reference) and subtracts the DC-related term at the end
   so the mean does not dominate the score. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 3 butterfly stages per source row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass with the final stage folded into the accumulation */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2810
/* DCT-domain SAD: forward-DCTs the 8x8 pixel difference and returns the
   sum of absolute coefficient values. */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* uint64_t backing buffer guarantees 8-byte alignment for the DCT */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum+= ABS(temp[i]);

    return sum;
}
2827
2828 void simple_idct(DCTELEM *block); //FIXME
2829
/* Quantization-noise metric: quantizes and dequantizes the DCT of the
   8x8 difference, inverse-transforms it, and returns the squared error
   against the unquantized difference. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    /* one aligned buffer holds both the working block and its backup */
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;  /* forces the inter quantizer path below */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    simple_idct(temp); //FIXME

    /* squared error between the round-tripped and the original block */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
2853
/* Rate-distortion score of an 8x8 block: quantizes the DCT of the
   difference, estimates the bit cost from the VLC length tables,
   reconstructs the block, and returns distortion + weighted bits. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    uint64_t __align8 aligned_bak[stride]; /* NOTE(review): VLA holding 8 rows at 'stride' bytes each -- assumes stride >= 8, confirm */
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distoration, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* save the 8x8 reference so the reconstruction can be added to it */
    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* accumulate code lengths for each (run, level) pair */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so levels in [-64,63] index the table */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* dequantize and reconstruct to measure the actual distortion */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    /* bits weighted by qscale^2 * 109/128 -- presumably an empirical
       rate-distortion lambda; verify against the encoder's RD model */
    return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
2932
/* Bit-cost metric: quantizes the DCT of the 8x8 difference and returns
   the estimated number of bits from the VLC length tables (no
   reconstruction or distortion term, unlike rd8x8_c). */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    /* pick the VLC length tables for the current coding mode */
    if (s->mb_intra) {
        start_i = 1;
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    /* accumulate code lengths for each (run, level) pair */
    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;  /* bias so levels in [-64,63] index the table */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);  /* the last coefficient must be nonzero */

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
2992
/* Intra vertical-SAD: sum of absolute differences between vertically
   adjacent samples of a 16-wide block (h-1 row pairs). */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += ABS(s[col] - s[col + stride]);
        s += stride;
    }

    return total;
}
3007
/* Inter vertical-SAD: absolute vertical gradient of the difference
   signal s1 - s2, summed over 16 columns and h-1 row pairs. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += ABS(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3022
3023 #define SQ(a) ((a)*(a))
/* Intra vertical-SSE: sum of squared differences between vertically
   adjacent samples of a 16-wide block (h-1 row pairs). */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += SQ(s[col] - s[col + stride]);
        s += stride;
    }

    return total;
}
3038
/* Inter vertical-SSE: squared vertical gradient of the difference
   signal s1 - s2, summed over 16 columns and h-1 row pairs. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total = 0;
    int col, row;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            total += SQ(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return total;
}
3053
/* Generate 16x16 comparison functions from the 8x8 ones.
   NOTE(review): WARPER8_16_SQ is defined elsewhere (dsputil.h) --
   presumably it combines the four 8x8 sub-block scores; verify there. */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3060
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
/* jpeg-reference IDCT followed by a clamped store of the result. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* jpeg-reference IDCT followed by a clamped add onto the destination. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3073 #endif
3074 /* init static data */
dsputil_static_init(void)3075 void dsputil_static_init(void)
3076 {
3077 int i;
3078
3079 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3080 for(i=0;i<MAX_NEG_CROP;i++) {
3081 cropTbl[i] = 0;
3082 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3083 }
3084
3085 for(i=0;i<512;i++) {
3086 squareTbl[i] = (i - 256) * (i - 256);
3087 }
3088
3089 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3090 }
3091
3092 #if 0
/* Initializes a DSPContext with the C reference implementations, lets
   platform-specific initializers override them, then builds the IDCT
   coefficient permutation table. (Currently compiled out via #if 0.) */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* inverse DCT selection; the permutation type records which
       coefficient order the chosen IDCT expects */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }

    /* pixel access / block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->gmc1 = gmc1_c;
    c->gmc = gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables: index is block size, sub-index is
       the (x2, y2, xy2) interpolation variant */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    /* third-pel motion compensation (indices 3, 7, 11, ... unused) */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel motion compensation: all 16 (x, y) sub-positions */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC (8/4/2-wide) and WMV2 mspel MC */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: [0] is the 16x16, [1] the 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#if 0
    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;
#endif
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* let each available platform implementation override the C ones */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif

    /* build the coefficient permutation table the final IDCT expects */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3312 #endif
3313