1 /*****************************************************************************
2  *
3  *  XVID MPEG-4 VIDEO CODEC
4  *  - QPel interpolation -
5  *
6  *  Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
7  *
8  *  This program is free software ; you can redistribute it and/or modify
9  *  it under the terms of the GNU General Public License as published by
10  *  the Free Software Foundation ; either version 2 of the License, or
11  *  (at your option) any later version.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program ; if not, write to the Free Software
20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  *
22  * $Id: qpel.c 1985 2011-05-18 09:02:35Z Isibaar $
23  *
24  ****************************************************************************/
25 
26 #ifndef XVID_AUTO_INCLUDE
27 
28 #include <stdio.h>
29 
30 #include "../portab.h"
31 #include "qpel.h"
32 
33 /* Quarterpel FIR definition
34  ****************************************************************************/
35 
36 static const int32_t FIR_Tab_8[9][8] = {
37 	{ 14, -3,  2, -1,  0,  0,  0,  0 },
38 	{ 23, 19, -6,  3, -1,  0,  0,  0 },
39 	{ -7, 20, 20, -6,  3, -1,  0,  0 },
40 	{  3, -6, 20, 20, -6,  3, -1,  0 },
41 	{ -1,  3, -6, 20, 20, -6,  3, -1 },
42 	{  0, -1,  3, -6, 20, 20, -6,  3 },
43 	{  0,  0, -1,  3, -6, 20, 20, -7 },
44 	{  0,  0,  0, -1,  3, -6, 19, 23 },
45 	{  0,  0,  0,  0, -1,  2, -3, 14 }
46 };
47 
48 static const int32_t FIR_Tab_16[17][16] = {
49 	{ 14, -3,  2, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
50 	{ 23, 19, -6,  3, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
51 	{ -7, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
52 	{  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0 },
53 	{ -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0,  0,  0,  0 },
54 	{  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0,  0,  0 },
55 	{  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0,  0 },
56 	{  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0,  0 },
57 	{  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0,  0 },
58 	{  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0,  0 },
59 	{  0,  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0,  0 },
60 	{  0,  0,  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1,  0 },
61 	{  0,  0,  0,  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3, -1 },
62 	{  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -6,  3 },
63 	{  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  3, -6, 20, 20, -7 },
64 	{  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  3, -6, 19, 23 },
65 	{  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  2, -3, 14 }
66 };
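/* How to read these tables (a descriptive note, matching the reference loops
 * further below): entry TABLE[i][k] is the weight of input sample Src[i] in
 * half-pel output k, i.e. each pass computes
 *
 *     Dst[k] = clip( ( Sum_{i=0..SIZE} TABLE[i][k]*Src[i] + 16-Rnd ) >> 5 )
 *
 * Interior outputs use the MPEG-4 8-tap filter {-1,3,-6,20,20,-6,3,-1}/32;
 * the rows near the block border fold the taps that would fall outside the
 * block back onto mirrored samples, which is where the 14/23/-7/19 entries
 * come from. Note that a pass reads SIZE+1 source samples to produce SIZE
 * outputs. */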
67 
68 /* Implementation
69  ****************************************************************************/
70 
71 #define XVID_AUTO_INCLUDE
72 /* First, self-include this file to generate the reference code that the SIMD
73  * versions are checked against. These functions are meant to be educational:
74  * they are straightforward to understand and use plain loops over the FIR
75  * tables, but they are obviously slow. */
76 #define REFERENCE_CODE
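/* A sketch of how the self-include mechanism below works: for every variant we
 * define STORE() (plain store vs. the add/average store used for B-frames) and
 * the FUNC_* names, then re-include this very file. The part of the file
 * guarded by XVID_AUTO_INCLUDE then expands into one full set of filter
 * functions per macro configuration, roughly:
 *
 *     #define STORE(d,s)  (d) = (s)
 *     #define FUNC_H      H_Pass_16_C_ref
 *     ...
 *     #include "qpel.c"     -> re-entry defines static void H_Pass_16_C_ref(...), etc.
 *
 * STORE and the FUNC_* macros are #undef'ed again at the end of the included
 * part, so the next configuration can redefine them. */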
77 
78 /* 16x? filters */
79 
80 #define SIZE  16
81 #define TABLE FIR_Tab_16
82 
83 #define STORE(d,s)  (d) = (s)
84 #define FUNC_H      H_Pass_16_C_ref
85 #define FUNC_V      V_Pass_16_C_ref
86 #define FUNC_HA     H_Pass_Avrg_16_C_ref
87 #define FUNC_VA     V_Pass_Avrg_16_C_ref
88 #define FUNC_HA_UP  H_Pass_Avrg_Up_16_C_ref
89 #define FUNC_VA_UP  V_Pass_Avrg_Up_16_C_ref
90 
91 #include "qpel.c"   /* self-include ourself */
92 
93 /* note: B-frames always use Rnd=0... */
94 #define STORE(d,s)  (d) = ( (s)+(d)+1 ) >> 1
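/* This STORE() variant does not overwrite Dst: it averages the freshly
 * filtered value s with the prediction already present in d (rounding up),
 * which is what the *_Add_* functions below are used for -- blending the
 * second prediction of a bidirectional (B-frame) block into the one already
 * stored in Dst. */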
95 #define FUNC_H      H_Pass_16_Add_C_ref
96 #define FUNC_V      V_Pass_16_Add_C_ref
97 #define FUNC_HA     H_Pass_Avrg_16_Add_C_ref
98 #define FUNC_VA     V_Pass_Avrg_16_Add_C_ref
99 #define FUNC_HA_UP  H_Pass_Avrg_Up_16_Add_C_ref
100 #define FUNC_VA_UP  V_Pass_Avrg_Up_16_Add_C_ref
101 
102 #include "qpel.c"   /* self-include ourself */
103 
104 #undef SIZE
105 #undef TABLE
106 
107 /* 8x? filters */
108 
109 #define SIZE  8
110 #define TABLE FIR_Tab_8
111 
112 #define STORE(d,s)  (d) = (s)
113 #define FUNC_H      H_Pass_8_C_ref
114 #define FUNC_V      V_Pass_8_C_ref
115 #define FUNC_HA     H_Pass_Avrg_8_C_ref
116 #define FUNC_VA     V_Pass_Avrg_8_C_ref
117 #define FUNC_HA_UP  H_Pass_Avrg_Up_8_C_ref
118 #define FUNC_VA_UP  V_Pass_Avrg_Up_8_C_ref
119 
120 #include "qpel.c"   /* self-include ourself */
121 
122 /* note: B-frames always use Rnd=0... */
123 #define STORE(d,s)  (d) = ( (s)+(d)+1 ) >> 1
124 #define FUNC_H      H_Pass_8_Add_C_ref
125 #define FUNC_V      V_Pass_8_Add_C_ref
126 #define FUNC_HA     H_Pass_Avrg_8_Add_C_ref
127 #define FUNC_VA     V_Pass_Avrg_8_Add_C_ref
128 #define FUNC_HA_UP  H_Pass_Avrg_Up_8_Add_C_ref
129 #define FUNC_VA_UP  V_Pass_Avrg_Up_8_Add_C_ref
130 
131 #include "qpel.c"   /* self-include ourself */
132 
133 #undef SIZE
134 #undef TABLE
135 
136 /* Then we define the more optimized C versions, where the loops are unrolled
137  * and the FIR coefficients are hardcoded in the instructions instead of being
138  * read from memory. They should be faster. */
139 #undef REFERENCE_CODE
140 
141 /* 16x? filters */
142 
143 #define SIZE  16
144 
145 #define STORE(d,s)  (d) = (s)
146 #define FUNC_H      H_Pass_16_C
147 #define FUNC_V      V_Pass_16_C
148 #define FUNC_HA     H_Pass_Avrg_16_C
149 #define FUNC_VA     V_Pass_Avrg_16_C
150 #define FUNC_HA_UP  H_Pass_Avrg_Up_16_C
151 #define FUNC_VA_UP  V_Pass_Avrg_Up_16_C
152 
153 #include "qpel.c"   /* self-include ourself */
154 
155 /* note: B-frames always use Rnd=0... */
156 #define STORE(d,s)  (d) = ( (s)+(d)+1 ) >> 1
157 #define FUNC_H      H_Pass_16_Add_C
158 #define FUNC_V      V_Pass_16_Add_C
159 #define FUNC_HA     H_Pass_Avrg_16_Add_C
160 #define FUNC_VA     V_Pass_Avrg_16_Add_C
161 #define FUNC_HA_UP  H_Pass_Avrg_Up_16_Add_C
162 #define FUNC_VA_UP  V_Pass_Avrg_Up_16_Add_C
163 
164 #include "qpel.c"   /* self-include ourself */
165 
166 #undef SIZE
167 #undef TABLE
168 
169 /* 8x? filters */
170 
171 #define SIZE  8
172 #define TABLE FIR_Tab_8
173 
174 #define STORE(d,s)  (d) = (s)
175 #define FUNC_H      H_Pass_8_C
176 #define FUNC_V      V_Pass_8_C
177 #define FUNC_HA     H_Pass_Avrg_8_C
178 #define FUNC_VA     V_Pass_Avrg_8_C
179 #define FUNC_HA_UP  H_Pass_Avrg_Up_8_C
180 #define FUNC_VA_UP  V_Pass_Avrg_Up_8_C
181 
182 #include "qpel.c"   /* self-include ourself */
183 
184 /* note: B-frames always use Rnd=0... */
185 #define STORE(d,s)  (d) = ( (s)+(d)+1 ) >> 1
186 #define FUNC_H      H_Pass_8_Add_C
187 #define FUNC_V      V_Pass_8_Add_C
188 #define FUNC_HA     H_Pass_Avrg_8_Add_C
189 #define FUNC_VA     V_Pass_Avrg_8_Add_C
190 #define FUNC_HA_UP  H_Pass_Avrg_Up_8_Add_C
191 #define FUNC_VA_UP  V_Pass_Avrg_Up_8_Add_C
192 
193 #include "qpel.c"   /* self-include ourself */
194 
195 #undef SIZE
196 #undef TABLE
197 #undef XVID_AUTO_INCLUDE
198 
199 /* Global scope hooks
200  ****************************************************************************/
201 
202 XVID_QP_FUNCS *xvid_QP_Funcs = NULL;
203 XVID_QP_FUNCS *xvid_QP_Add_Funcs = NULL;
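/* These two pointers are filled in at codec initialisation with the best
 * implementation available for the target CPU. A minimal sketch of the
 * selection (the real dispatching is done elsewhere in the codec, together
 * with the xvid_Init_QP() call; XVID_CPU_MMX is the usual capability flag):
 *
 *     xvid_QP_Funcs     = &xvid_QP_Funcs_C;
 *     xvid_QP_Add_Funcs = &xvid_QP_Add_Funcs_C;
 *   #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
 *     if (cpu_flags & XVID_CPU_MMX) {
 *         xvid_QP_Funcs     = &xvid_QP_Funcs_mmx;
 *         xvid_QP_Add_Funcs = &xvid_QP_Add_Funcs_mmx;
 *     }
 *   #endif
 */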
204 
205 /* Reference plain C impl. declaration
206  ****************************************************************************/
207 
208 XVID_QP_FUNCS xvid_QP_Funcs_C_ref = {
209 	H_Pass_16_C_ref, H_Pass_Avrg_16_C_ref, H_Pass_Avrg_Up_16_C_ref,
210 	V_Pass_16_C_ref, V_Pass_Avrg_16_C_ref, V_Pass_Avrg_Up_16_C_ref,
211 
212 	H_Pass_8_C_ref, H_Pass_Avrg_8_C_ref, H_Pass_Avrg_Up_8_C_ref,
213 	V_Pass_8_C_ref, V_Pass_Avrg_8_C_ref, V_Pass_Avrg_Up_8_C_ref
214 };
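/* Naming convention, as can be read off the reference code below: the plain
 * H/V passes output the filtered half-pel value, the "Avrg" variants further
 * average it with the left/top full-pel sample, and the "Avrg_Up" variants
 * with the right/bottom one -- i.e. the two quarter-pel positions on either
 * side of the half-pel sample. */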
215 
216 XVID_QP_FUNCS xvid_QP_Add_Funcs_C_ref = {
217 	H_Pass_16_Add_C_ref, H_Pass_Avrg_16_Add_C_ref, H_Pass_Avrg_Up_16_Add_C_ref,
218 	V_Pass_16_Add_C_ref, V_Pass_Avrg_16_Add_C_ref, V_Pass_Avrg_Up_16_Add_C_ref,
219 
220 	H_Pass_8_Add_C_ref, H_Pass_Avrg_8_Add_C_ref, H_Pass_Avrg_Up_8_Add_C_ref,
221 	V_Pass_8_Add_C_ref, V_Pass_Avrg_8_Add_C_ref, V_Pass_Avrg_Up_8_Add_C_ref
222 };
223 
224 /* Plain C impl. declaration (faster than ref one)
225  ****************************************************************************/
226 
227 XVID_QP_FUNCS xvid_QP_Funcs_C = {
228 	H_Pass_16_C, H_Pass_Avrg_16_C, H_Pass_Avrg_Up_16_C,
229 	V_Pass_16_C, V_Pass_Avrg_16_C, V_Pass_Avrg_Up_16_C,
230 
231 	H_Pass_8_C, H_Pass_Avrg_8_C, H_Pass_Avrg_Up_8_C,
232 	V_Pass_8_C, V_Pass_Avrg_8_C, V_Pass_Avrg_Up_8_C
233 };
234 
235 XVID_QP_FUNCS xvid_QP_Add_Funcs_C = {
236 	H_Pass_16_Add_C, H_Pass_Avrg_16_Add_C, H_Pass_Avrg_Up_16_Add_C,
237 	V_Pass_16_Add_C, V_Pass_Avrg_16_Add_C, V_Pass_Avrg_Up_16_Add_C,
238 
239 	H_Pass_8_Add_C, H_Pass_Avrg_8_Add_C, H_Pass_Avrg_Up_8_Add_C,
240 	V_Pass_8_Add_C, V_Pass_Avrg_8_Add_C, V_Pass_Avrg_Up_8_Add_C
241 };
242 
243 /* mmx impl. declaration (see qpel_mmx.asm)
244  ****************************************************************************/
245 
246 #if defined (ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
247 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_16_mmx);
248 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_16_mmx);
249 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_Up_16_mmx);
250 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_16_mmx);
251 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_16_mmx);
252 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_Up_16_mmx);
253 
254 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_8_mmx);
255 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_8_mmx);
256 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_Up_8_mmx);
257 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_8_mmx);
258 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_8_mmx);
259 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_Up_8_mmx);
260 
261 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Add_16_mmx);
262 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_Add_16_mmx);
263 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_Up_Add_16_mmx);
264 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Add_16_mmx);
265 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_Add_16_mmx);
266 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_Up_Add_16_mmx);
267 
268 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_8_Add_mmx);
269 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_8_Add_mmx);
270 extern XVID_QP_PASS_SIGNATURE(xvid_H_Pass_Avrg_Up_8_Add_mmx);
271 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_8_Add_mmx);
272 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_8_Add_mmx);
273 extern XVID_QP_PASS_SIGNATURE(xvid_V_Pass_Avrg_Up_8_Add_mmx);
274 
275 XVID_QP_FUNCS xvid_QP_Funcs_mmx = {
276 	xvid_H_Pass_16_mmx, xvid_H_Pass_Avrg_16_mmx, xvid_H_Pass_Avrg_Up_16_mmx,
277 	xvid_V_Pass_16_mmx, xvid_V_Pass_Avrg_16_mmx, xvid_V_Pass_Avrg_Up_16_mmx,
278 
279 	xvid_H_Pass_8_mmx, xvid_H_Pass_Avrg_8_mmx, xvid_H_Pass_Avrg_Up_8_mmx,
280 	xvid_V_Pass_8_mmx, xvid_V_Pass_Avrg_8_mmx, xvid_V_Pass_Avrg_Up_8_mmx
281 };
282 
283 XVID_QP_FUNCS xvid_QP_Add_Funcs_mmx = {
284 	xvid_H_Pass_Add_16_mmx, xvid_H_Pass_Avrg_Add_16_mmx, xvid_H_Pass_Avrg_Up_Add_16_mmx,
285 	xvid_V_Pass_Add_16_mmx, xvid_V_Pass_Avrg_Add_16_mmx, xvid_V_Pass_Avrg_Up_Add_16_mmx,
286 
287 	xvid_H_Pass_8_Add_mmx, xvid_H_Pass_Avrg_8_Add_mmx, xvid_H_Pass_Avrg_Up_8_Add_mmx,
288 	xvid_V_Pass_8_Add_mmx, xvid_V_Pass_Avrg_8_Add_mmx, xvid_V_Pass_Avrg_Up_8_Add_mmx,
289 };
290 #endif /* ARCH_IS_IA32 || ARCH_IS_X86_64 */
291 
292 
293 /* altivec impl. declaration (see qpel_altivec.c)
294  ****************************************************************************/
295 
296 #ifdef ARCH_IS_PPC
297 
298 extern XVID_QP_PASS_SIGNATURE(H_Pass_16_Altivec_C);
299 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_16_Altivec_C);
300 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_Up_16_Altivec_C);
301 extern XVID_QP_PASS_SIGNATURE(V_Pass_16_Altivec_C);
302 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_16_Altivec_C);
303 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_Up_16_Altivec_C);
304 
305 extern XVID_QP_PASS_SIGNATURE(H_Pass_8_Altivec_C);
306 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_8_Altivec_C);
307 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_Up_8_Altivec_C);
308 extern XVID_QP_PASS_SIGNATURE(V_Pass_8_Altivec_C);
309 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_8_Altivec_C);
310 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_Up_8_Altivec_C);
311 
312 
313 extern XVID_QP_PASS_SIGNATURE(H_Pass_16_Add_Altivec_C);
314 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_16_Add_Altivec_C);
315 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_Up_16_Add_Altivec_C);
316 extern XVID_QP_PASS_SIGNATURE(V_Pass_16_Add_Altivec_C);
317 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_16_Add_Altivec_C);
318 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_Up_16_Add_Altivec_C);
319 
320 extern XVID_QP_PASS_SIGNATURE(H_Pass_8_Add_Altivec_C);
321 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_8_Add_Altivec_C);
322 extern XVID_QP_PASS_SIGNATURE(H_Pass_Avrg_Up_8_Add_Altivec_C);
323 extern XVID_QP_PASS_SIGNATURE(V_Pass_8_Add_Altivec_C);
324 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_8_Add_Altivec_C);
325 extern XVID_QP_PASS_SIGNATURE(V_Pass_Avrg_Up_8_Add_Altivec_C);
326 
327 XVID_QP_FUNCS xvid_QP_Funcs_Altivec_C = {
328 	H_Pass_16_Altivec_C, H_Pass_Avrg_16_Altivec_C, H_Pass_Avrg_Up_16_Altivec_C,
329 	V_Pass_16_Altivec_C, V_Pass_Avrg_16_Altivec_C, V_Pass_Avrg_Up_16_Altivec_C,
330 
331 	H_Pass_8_Altivec_C, H_Pass_Avrg_8_Altivec_C, H_Pass_Avrg_Up_8_Altivec_C,
332 	V_Pass_8_Altivec_C, V_Pass_Avrg_8_Altivec_C, V_Pass_Avrg_Up_8_Altivec_C
333 };
334 
335 XVID_QP_FUNCS xvid_QP_Add_Funcs_Altivec_C = {
336 	H_Pass_16_Add_Altivec_C, H_Pass_Avrg_16_Add_Altivec_C, H_Pass_Avrg_Up_16_Add_Altivec_C,
337 	V_Pass_16_Add_Altivec_C, V_Pass_Avrg_16_Add_Altivec_C, V_Pass_Avrg_Up_16_Add_Altivec_C,
338 
339 	H_Pass_8_Add_Altivec_C, H_Pass_Avrg_8_Add_Altivec_C, H_Pass_Avrg_Up_8_Add_Altivec_C,
340 	V_Pass_8_Add_Altivec_C, V_Pass_Avrg_8_Add_Altivec_C, V_Pass_Avrg_Up_8_Add_Altivec_C
341 };
342 
343 #endif /* ARCH_IS_PPC */
344 
345 /* tables for ASM
346  ****************************************************************************/
347 
348 
349 #if defined(ARCH_IS_IA32) || defined(ARCH_IS_X86_64)
350 /* These symbols will be used outside this file, so tell the compiler
351  * they're global. */
352 extern uint16_t xvid_Expand_mmx[256][4]; /* 8b -> 64b expansion table */
353 
354 extern int16_t xvid_FIR_1_0_0_0[256][4];
355 extern int16_t xvid_FIR_3_1_0_0[256][4];
356 extern int16_t xvid_FIR_6_3_1_0[256][4];
357 extern int16_t xvid_FIR_14_3_2_1[256][4];
358 extern int16_t xvid_FIR_20_6_3_1[256][4];
359 extern int16_t xvid_FIR_20_20_6_3[256][4];
360 extern int16_t xvid_FIR_23_19_6_3[256][4];
361 extern int16_t xvid_FIR_7_20_20_6[256][4];
362 extern int16_t xvid_FIR_6_20_20_6[256][4];
363 extern int16_t xvid_FIR_6_20_20_7[256][4];
364 extern int16_t xvid_FIR_3_6_20_20[256][4];
365 extern int16_t xvid_FIR_3_6_19_23[256][4];
366 extern int16_t xvid_FIR_1_3_6_20[256][4];
367 extern int16_t xvid_FIR_1_2_3_14[256][4];
368 extern int16_t xvid_FIR_0_1_3_6[256][4];
369 extern int16_t xvid_FIR_0_0_1_3[256][4];
370 extern int16_t xvid_FIR_0_0_0_1[256][4];
371 #endif
372 
373 /* Array definitions, according to the target platform */
374 
375 #if !defined(ARCH_IS_X86_64) && !defined(ARCH_IS_IA32)
376 /* Only ia32/x86_64 will use these tables outside this file, so mark them
377  * static for all other archs */
378 #define __SCOPE static
379 __SCOPE int16_t xvid_FIR_1_0_0_0[256][4];
380 __SCOPE int16_t xvid_FIR_3_1_0_0[256][4];
381 __SCOPE int16_t xvid_FIR_6_3_1_0[256][4];
382 __SCOPE int16_t xvid_FIR_14_3_2_1[256][4];
383 __SCOPE int16_t xvid_FIR_20_6_3_1[256][4];
384 __SCOPE int16_t xvid_FIR_20_20_6_3[256][4];
385 __SCOPE int16_t xvid_FIR_23_19_6_3[256][4];
386 __SCOPE int16_t xvid_FIR_7_20_20_6[256][4];
387 __SCOPE int16_t xvid_FIR_6_20_20_6[256][4];
388 __SCOPE int16_t xvid_FIR_6_20_20_7[256][4];
389 __SCOPE int16_t xvid_FIR_3_6_20_20[256][4];
390 __SCOPE int16_t xvid_FIR_3_6_19_23[256][4];
391 __SCOPE int16_t xvid_FIR_1_3_6_20[256][4];
392 __SCOPE int16_t xvid_FIR_1_2_3_14[256][4];
393 __SCOPE int16_t xvid_FIR_0_1_3_6[256][4];
394 __SCOPE int16_t xvid_FIR_0_0_1_3[256][4];
395 __SCOPE int16_t xvid_FIR_0_0_0_1[256][4];
396 #endif
397 
398 static void Init_FIR_Table(int16_t Tab[][4],
399                            int A, int B, int C, int D)
400 {
401 	int i;
402 	for(i=0; i<256; ++i) {
403 		Tab[i][0] = i*A;
404 		Tab[i][1] = i*B;
405 		Tab[i][2] = i*C;
406 		Tab[i][3] = i*D;
407 	}
408 }
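/* Each row i of a table initialised this way holds the four products
 * {i*A, i*B, i*C, i*D}, so a single 8-byte load indexed by a source byte
 * yields four ready-made partial products. This is what the table-driven
 * filtering path mentioned below (USE_TABLES in qpel_mmx.asm) relies on,
 * instead of multiplying at run time. */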
409 
410 
411 void xvid_Init_QP(void)
412 {
413 #if defined (ARCH_IS_IA32) || defined (ARCH_IS_X86_64)
414 	int i;
415 
416 	for(i=0; i<256; ++i) {
417 		xvid_Expand_mmx[i][0] = i;
418 		xvid_Expand_mmx[i][1] = i;
419 		xvid_Expand_mmx[i][2] = i;
420 		xvid_Expand_mmx[i][3] = i;
421 	}
422 #endif
423 
424 	/* Alternate way of filtering (cf. USE_TABLES flag in qpel_mmx.asm) */
425 
426 	Init_FIR_Table(xvid_FIR_1_0_0_0,   -1,  0,  0,  0);
427 	Init_FIR_Table(xvid_FIR_3_1_0_0,    3, -1,  0,  0);
428 	Init_FIR_Table(xvid_FIR_6_3_1_0,   -6,  3, -1,  0);
429 	Init_FIR_Table(xvid_FIR_14_3_2_1,  14, -3,  2, -1);
430 	Init_FIR_Table(xvid_FIR_20_6_3_1,  20, -6,  3, -1);
431 	Init_FIR_Table(xvid_FIR_20_20_6_3, 20, 20, -6,  3);
432 	Init_FIR_Table(xvid_FIR_23_19_6_3, 23, 19, -6,  3);
433 	Init_FIR_Table(xvid_FIR_7_20_20_6, -7, 20, 20, -6);
434 	Init_FIR_Table(xvid_FIR_6_20_20_6, -6, 20, 20, -6);
435 	Init_FIR_Table(xvid_FIR_6_20_20_7, -6, 20, 20, -7);
436 	Init_FIR_Table(xvid_FIR_3_6_20_20,  3, -6, 20, 20);
437 	Init_FIR_Table(xvid_FIR_3_6_19_23,  3, -6, 19, 23);
438 	Init_FIR_Table(xvid_FIR_1_3_6_20,  -1,  3, -6, 20);
439 	Init_FIR_Table(xvid_FIR_1_2_3_14,  -1,  2, -3, 14);
440 	Init_FIR_Table(xvid_FIR_0_1_3_6,    0, -1,  3, -6);
441 	Init_FIR_Table(xvid_FIR_0_0_1_3,    0,  0, -1,  3);
442 	Init_FIR_Table(xvid_FIR_0_0_0_1,    0,  0,  0, -1);
443 
444 }
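/* xvid_Init_QP() is expected to be called once at codec start-up, before any
 * quarter-pel filtering takes place, so that the expansion/FIR tables used by
 * the assembly paths are valid. Calling it again is harmless: it simply
 * rewrites the same values. */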
445 
446 #endif /* !XVID_AUTO_INCLUDE */
447 
448 #if defined(XVID_AUTO_INCLUDE) && defined(REFERENCE_CODE)
449 
450 /*****************************************************************************
451  * "reference" filters impl. in plain C
452  ****************************************************************************/
453 
454 static
455 void FUNC_H(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t Rnd)
456 {
457 	while(H-->0) {
458 		int32_t i, k;
459 		int32_t Sums[SIZE] = { 0 };
460 		for(i=0; i<=SIZE; ++i)
461 			for(k=0; k<SIZE; ++k)
462 				Sums[k] += TABLE[i][k] * Src[i];
463 
464 		for(i=0; i<SIZE; ++i) {
465 			int32_t C = ( Sums[i] + 16-Rnd ) >> 5;
466 			if (C<0) C = 0; else if (C>255) C = 255;
467 			STORE(Dst[i], C);
468 		}
469 		Src += BpS;
470 		Dst += BpS;
471 	}
472 }
473 
474 static
475 void FUNC_V(uint8_t *Dst, const uint8_t *Src, int32_t W, int32_t BpS, int32_t Rnd)
476 {
477 	while(W-->0) {
478 		int32_t i, k;
479 		int32_t Sums[SIZE] = { 0 };
480 		const uint8_t *S = Src++;
481 		uint8_t *D = Dst++;
482 		for(i=0; i<=SIZE; ++i) {
483 			for(k=0; k<SIZE; ++k)
484 				Sums[k] += TABLE[i][k] * S[0];
485 			S += BpS;
486 		}
487 
488 		for(i=0; i<SIZE; ++i) {
489 			int32_t C = ( Sums[i] + 16-Rnd )>>5;
490 			if (C<0) C = 0; else if (C>255) C = 255;
491 			STORE(D[0], C);
492 			D += BpS;
493 		}
494 	}
495 }
496 
497 static
498 void FUNC_HA(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t Rnd)
499 {
500 	while(H-->0) {
501 		int32_t i, k;
502 		int32_t Sums[SIZE] = { 0 };
503 		for(i=0; i<=SIZE; ++i)
504 			for(k=0; k<SIZE; ++k)
505 				Sums[k] += TABLE[i][k] * Src[i];
506 
507 		for(i=0; i<SIZE; ++i) {
508 			int32_t C = ( Sums[i] + 16-Rnd ) >> 5;
509 			if (C<0) C = 0; else if (C>255) C = 255;
510 			C = (C+Src[i]+1-Rnd) >> 1;
511 			STORE(Dst[i], C);
512 		}
513 		Src += BpS;
514 		Dst += BpS;
515 	}
516 }
517 
518 static
519 void FUNC_HA_UP(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t Rnd)
520 {
521 	while(H-->0) {
522 		int32_t i, k;
523 		int32_t Sums[SIZE] = { 0 };
524 		for(i=0; i<=SIZE; ++i)
525 			for(k=0; k<SIZE; ++k)
526 				Sums[k] += TABLE[i][k] * Src[i];
527 
528 		for(i=0; i<SIZE; ++i) {
529 			int32_t C = ( Sums[i] + 16-Rnd ) >> 5;
530 			if (C<0) C = 0; else if (C>255) C = 255;
531 			C = (C+Src[i+1]+1-Rnd) >> 1;
532 			STORE(Dst[i], C);
533 		}
534 		Src += BpS;
535 		Dst += BpS;
536 	}
537 }
538 
539 static
540 void FUNC_VA(uint8_t *Dst, const uint8_t *Src, int32_t W, int32_t BpS, int32_t Rnd)
541 {
542 	while(W-->0) {
543 		int32_t i, k;
544 		int32_t Sums[SIZE] = { 0 };
545 		const uint8_t *S = Src;
546 		uint8_t *D = Dst;
547 
548 		for(i=0; i<=SIZE; ++i) {
549 			for(k=0; k<SIZE; ++k)
550 				Sums[k] += TABLE[i][k] * S[0];
551 			S += BpS;
552 		}
553 
554 		S = Src;
555 		for(i=0; i<SIZE; ++i) {
556 			int32_t C = ( Sums[i] + 16-Rnd )>>5;
557 			if (C<0) C = 0; else if (C>255) C = 255;
558 			C = ( C+S[0]+1-Rnd ) >> 1;
559 			STORE(D[0], C);
560 			D += BpS;
561 			S += BpS;
562 		}
563 		Src++;
564 		Dst++;
565 	}
566 }
567 
568 static
569 void FUNC_VA_UP(uint8_t *Dst, const uint8_t *Src, int32_t W, int32_t BpS, int32_t Rnd)
570 {
571 	while(W-->0) {
572 		int32_t i, k;
573 		int32_t Sums[SIZE] = { 0 };
574 		const uint8_t *S = Src;
575 		uint8_t *D = Dst;
576 
577 		for(i=0; i<=SIZE; ++i) {
578 			for(k=0; k<SIZE; ++k)
579 				Sums[k] += TABLE[i][k] * S[0];
580 			S += BpS;
581 		}
582 
583 		S = Src + BpS;
584 		for(i=0; i<SIZE; ++i) {
585 			int32_t C = ( Sums[i] + 16-Rnd )>>5;
586 			if (C<0) C = 0; else if (C>255) C = 255;
587 			C = ( C+S[0]+1-Rnd ) >> 1;
588 			STORE(D[0], C);
589 			D += BpS;
590 			S += BpS;
591 		}
592 		Dst++;
593 		Src++;
594 	}
595 }
596 
597 #undef STORE
598 #undef FUNC_H
599 #undef FUNC_V
600 #undef FUNC_HA
601 #undef FUNC_VA
602 #undef FUNC_HA_UP
603 #undef FUNC_VA_UP
604 
605 #elif defined(XVID_AUTO_INCLUDE) && !defined(REFERENCE_CODE)
606 
607 /*****************************************************************************
608  * "fast" filters impl. in plain C
609  ****************************************************************************/
610 
611 #define CLIP_STORE(D,C) \
612   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
613   STORE(D, C)
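/* Note on the clipping above: at this point C is still the unshifted sum
 * (32x the pixel value, plus the 16-RND bias), so comparing against 255<<5
 * before shifting is equivalent to shifting first and then clamping to
 * [0,255], while letting the saturated branches skip the shift entirely. */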
614 
615 static void
616 FUNC_H(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
617 {
618 #if (SIZE==16)
619   while(H-->0) {
620     int C;
621     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
622     CLIP_STORE(Dst[ 0],C);
623     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
624     CLIP_STORE(Dst[ 1],C);
625     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
626     CLIP_STORE(Dst[ 2],C);
627     C = 16-RND - (Src[0]+Src[7 ]) + 3*(Src[ 1]+Src[ 6])-6*(Src[ 2]+Src[ 5]) + 20*(Src[ 3]+Src[ 4]);
628     CLIP_STORE(Dst[ 3],C);
629     C = 16-RND - (Src[1]+Src[8 ]) + 3*(Src[ 2]+Src[ 7])-6*(Src[ 3]+Src[ 6]) + 20*(Src[ 4]+Src[ 5]);
630     CLIP_STORE(Dst[ 4],C);
631     C = 16-RND - (Src[2]+Src[9 ]) + 3*(Src[ 3]+Src[ 8])-6*(Src[ 4]+Src[ 7]) + 20*(Src[ 5]+Src[ 6]);
632     CLIP_STORE(Dst[ 5],C);
633     C = 16-RND - (Src[3]+Src[10]) + 3*(Src[ 4]+Src[ 9])-6*(Src[ 5]+Src[ 8]) + 20*(Src[ 6]+Src[ 7]);
634     CLIP_STORE(Dst[ 6],C);
635     C = 16-RND - (Src[4]+Src[11]) + 3*(Src[ 5]+Src[10])-6*(Src[ 6]+Src[ 9]) + 20*(Src[ 7]+Src[ 8]);
636     CLIP_STORE(Dst[ 7],C);
637     C = 16-RND - (Src[5]+Src[12]) + 3*(Src[ 6]+Src[11])-6*(Src[ 7]+Src[10]) + 20*(Src[ 8]+Src[ 9]);
638     CLIP_STORE(Dst[ 8],C);
639     C = 16-RND - (Src[6]+Src[13]) + 3*(Src[ 7]+Src[12])-6*(Src[ 8]+Src[11]) + 20*(Src[ 9]+Src[10]);
640     CLIP_STORE(Dst[ 9],C);
641     C = 16-RND - (Src[7]+Src[14]) + 3*(Src[ 8]+Src[13])-6*(Src[ 9]+Src[12]) + 20*(Src[10]+Src[11]);
642     CLIP_STORE(Dst[10],C);
643     C = 16-RND - (Src[8]+Src[15]) + 3*(Src[ 9]+Src[14])-6*(Src[10]+Src[13]) + 20*(Src[11]+Src[12]);
644     CLIP_STORE(Dst[11],C);
645     C = 16-RND - (Src[9]+Src[16]) + 3*(Src[10]+Src[15])-6*(Src[11]+Src[14]) + 20*(Src[12]+Src[13]);
646     CLIP_STORE(Dst[12],C);
647     C = 16-RND - Src[10] +3*Src[11] -6*(Src[12]+Src[15]) + 20*(Src[13]+Src[14]) +2*Src[16];
648     CLIP_STORE(Dst[13],C);
649     C = 16-RND - Src[11] +3*(Src[12]-Src[16]) -6*Src[13] + 20*Src[14] + 19*Src[15];
650     CLIP_STORE(Dst[14],C);
651     C = 16-RND - Src[12] +3*Src[13] -7*Src[14] + 23*Src[15] + 14*Src[16];
652     CLIP_STORE(Dst[15],C);
653     Src += BpS;
654     Dst += BpS;
655   }
656 #else
657   while(H-->0) {
658     int C;
659     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
660     CLIP_STORE(Dst[0],C);
661     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
662     CLIP_STORE(Dst[1],C);
663     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
664     CLIP_STORE(Dst[2],C);
665     C = 16-RND - (Src[0]+Src[7]) + 3*(Src[1]+Src[6])-6*(Src[2]+Src[5]) + 20*(Src[3]+Src[4]);
666     CLIP_STORE(Dst[3],C);
667     C = 16-RND - (Src[1]+Src[8]) + 3*(Src[2]+Src[7])-6*(Src[3]+Src[6]) + 20*(Src[4]+Src[5]);
668     CLIP_STORE(Dst[4],C);
669     C = 16-RND - Src[2] +3*Src[3] -6*(Src[4]+Src[7]) + 20*(Src[5]+Src[6]) +2*Src[8];
670     CLIP_STORE(Dst[5],C);
671     C = 16-RND - Src[3] +3*(Src[4]-Src[8]) -6*Src[5] + 20*Src[6] + 19*Src[7];
672     CLIP_STORE(Dst[6],C);
673     C = 16-RND - Src[4] +3*Src[5] -7*Src[6] + 23*Src[7] + 14*Src[8];
674     CLIP_STORE(Dst[7],C);
675     Src += BpS;
676     Dst += BpS;
677   }
678 #endif
679 }
680 #undef CLIP_STORE
681 
682 #define CLIP_STORE(i,C) \
683   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
684   C = (C+Src[i]+1-RND) >> 1;  \
685   STORE(Dst[i], C)
686 
687 static void
688 FUNC_HA(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
689 {
690 #if (SIZE==16)
691   while(H-->0) {
692     int C;
693     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
694     CLIP_STORE(0,C);
695     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
696     CLIP_STORE( 1,C);
697     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
698     CLIP_STORE( 2,C);
699     C = 16-RND - (Src[0]+Src[7 ]) + 3*(Src[ 1]+Src[ 6])-6*(Src[ 2]+Src[ 5]) + 20*(Src[ 3]+Src[ 4]);
700     CLIP_STORE( 3,C);
701     C = 16-RND - (Src[1]+Src[8 ]) + 3*(Src[ 2]+Src[ 7])-6*(Src[ 3]+Src[ 6]) + 20*(Src[ 4]+Src[ 5]);
702     CLIP_STORE( 4,C);
703     C = 16-RND - (Src[2]+Src[9 ]) + 3*(Src[ 3]+Src[ 8])-6*(Src[ 4]+Src[ 7]) + 20*(Src[ 5]+Src[ 6]);
704     CLIP_STORE( 5,C);
705     C = 16-RND - (Src[3]+Src[10]) + 3*(Src[ 4]+Src[ 9])-6*(Src[ 5]+Src[ 8]) + 20*(Src[ 6]+Src[ 7]);
706     CLIP_STORE( 6,C);
707     C = 16-RND - (Src[4]+Src[11]) + 3*(Src[ 5]+Src[10])-6*(Src[ 6]+Src[ 9]) + 20*(Src[ 7]+Src[ 8]);
708     CLIP_STORE( 7,C);
709     C = 16-RND - (Src[5]+Src[12]) + 3*(Src[ 6]+Src[11])-6*(Src[ 7]+Src[10]) + 20*(Src[ 8]+Src[ 9]);
710     CLIP_STORE( 8,C);
711     C = 16-RND - (Src[6]+Src[13]) + 3*(Src[ 7]+Src[12])-6*(Src[ 8]+Src[11]) + 20*(Src[ 9]+Src[10]);
712     CLIP_STORE( 9,C);
713     C = 16-RND - (Src[7]+Src[14]) + 3*(Src[ 8]+Src[13])-6*(Src[ 9]+Src[12]) + 20*(Src[10]+Src[11]);
714     CLIP_STORE(10,C);
715     C = 16-RND - (Src[8]+Src[15]) + 3*(Src[ 9]+Src[14])-6*(Src[10]+Src[13]) + 20*(Src[11]+Src[12]);
716     CLIP_STORE(11,C);
717     C = 16-RND - (Src[9]+Src[16]) + 3*(Src[10]+Src[15])-6*(Src[11]+Src[14]) + 20*(Src[12]+Src[13]);
718     CLIP_STORE(12,C);
719     C = 16-RND - Src[10] +3*Src[11] -6*(Src[12]+Src[15]) + 20*(Src[13]+Src[14]) +2*Src[16];
720     CLIP_STORE(13,C);
721     C = 16-RND - Src[11] +3*(Src[12]-Src[16]) -6*Src[13] + 20*Src[14] + 19*Src[15];
722     CLIP_STORE(14,C);
723     C = 16-RND - Src[12] +3*Src[13] -7*Src[14] + 23*Src[15] + 14*Src[16];
724     CLIP_STORE(15,C);
725     Src += BpS;
726     Dst += BpS;
727   }
728 #else
729   while(H-->0) {
730     int C;
731     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
732     CLIP_STORE(0,C);
733     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
734     CLIP_STORE(1,C);
735     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
736     CLIP_STORE(2,C);
737     C = 16-RND - (Src[0]+Src[7]) + 3*(Src[1]+Src[6])-6*(Src[2]+Src[5]) + 20*(Src[3]+Src[4]);
738     CLIP_STORE(3,C);
739     C = 16-RND - (Src[1]+Src[8]) + 3*(Src[2]+Src[7])-6*(Src[3]+Src[6]) + 20*(Src[4]+Src[5]);
740     CLIP_STORE(4,C);
741     C = 16-RND - Src[2] +3*Src[3] -6*(Src[4]+Src[7]) + 20*(Src[5]+Src[6]) +2*Src[8];
742     CLIP_STORE(5,C);
743     C = 16-RND - Src[3] +3*(Src[4]-Src[8]) -6*Src[5] + 20*Src[6] + 19*Src[7];
744     CLIP_STORE(6,C);
745     C = 16-RND - Src[4] +3*Src[5] -7*Src[6] + 23*Src[7] + 14*Src[8];
746     CLIP_STORE(7,C);
747     Src += BpS;
748     Dst += BpS;
749   }
750 #endif
751 }
752 #undef CLIP_STORE
753 
754 #define CLIP_STORE(i,C) \
755   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
756   C = (C+Src[i+1]+1-RND) >> 1;  \
757   STORE(Dst[i], C)
758 
759 static void
760 FUNC_HA_UP(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
761 {
762 #if (SIZE==16)
763   while(H-->0) {
764     int C;
765     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
766     CLIP_STORE(0,C);
767     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
768     CLIP_STORE( 1,C);
769     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
770     CLIP_STORE( 2,C);
771     C = 16-RND - (Src[0]+Src[7 ]) + 3*(Src[ 1]+Src[ 6])-6*(Src[ 2]+Src[ 5]) + 20*(Src[ 3]+Src[ 4]);
772     CLIP_STORE( 3,C);
773     C = 16-RND - (Src[1]+Src[8 ]) + 3*(Src[ 2]+Src[ 7])-6*(Src[ 3]+Src[ 6]) + 20*(Src[ 4]+Src[ 5]);
774     CLIP_STORE( 4,C);
775     C = 16-RND - (Src[2]+Src[9 ]) + 3*(Src[ 3]+Src[ 8])-6*(Src[ 4]+Src[ 7]) + 20*(Src[ 5]+Src[ 6]);
776     CLIP_STORE( 5,C);
777     C = 16-RND - (Src[3]+Src[10]) + 3*(Src[ 4]+Src[ 9])-6*(Src[ 5]+Src[ 8]) + 20*(Src[ 6]+Src[ 7]);
778     CLIP_STORE( 6,C);
779     C = 16-RND - (Src[4]+Src[11]) + 3*(Src[ 5]+Src[10])-6*(Src[ 6]+Src[ 9]) + 20*(Src[ 7]+Src[ 8]);
780     CLIP_STORE( 7,C);
781     C = 16-RND - (Src[5]+Src[12]) + 3*(Src[ 6]+Src[11])-6*(Src[ 7]+Src[10]) + 20*(Src[ 8]+Src[ 9]);
782     CLIP_STORE( 8,C);
783     C = 16-RND - (Src[6]+Src[13]) + 3*(Src[ 7]+Src[12])-6*(Src[ 8]+Src[11]) + 20*(Src[ 9]+Src[10]);
784     CLIP_STORE( 9,C);
785     C = 16-RND - (Src[7]+Src[14]) + 3*(Src[ 8]+Src[13])-6*(Src[ 9]+Src[12]) + 20*(Src[10]+Src[11]);
786     CLIP_STORE(10,C);
787     C = 16-RND - (Src[8]+Src[15]) + 3*(Src[ 9]+Src[14])-6*(Src[10]+Src[13]) + 20*(Src[11]+Src[12]);
788     CLIP_STORE(11,C);
789     C = 16-RND - (Src[9]+Src[16]) + 3*(Src[10]+Src[15])-6*(Src[11]+Src[14]) + 20*(Src[12]+Src[13]);
790     CLIP_STORE(12,C);
791     C = 16-RND - Src[10] +3*Src[11] -6*(Src[12]+Src[15]) + 20*(Src[13]+Src[14]) +2*Src[16];
792     CLIP_STORE(13,C);
793     C = 16-RND - Src[11] +3*(Src[12]-Src[16]) -6*Src[13] + 20*Src[14] + 19*Src[15];
794     CLIP_STORE(14,C);
795     C = 16-RND - Src[12] +3*Src[13] -7*Src[14] + 23*Src[15] + 14*Src[16];
796     CLIP_STORE(15,C);
797     Src += BpS;
798     Dst += BpS;
799   }
800 #else
801   while(H-->0) {
802     int C;
803     C = 16-RND +14*Src[0] +23*Src[1] - 7*Src[2] + 3*Src[3] -   Src[4];
804     CLIP_STORE(0,C);
805     C = 16-RND - 3*(Src[0]-Src[4]) +19*Src[1] +20*Src[2] - 6*Src[3] - Src[5];
806     CLIP_STORE(1,C);
807     C = 16-RND + 2*Src[0] - 6*(Src[1]+Src[4]) +20*(Src[2]+Src[3]) + 3*Src[5] - Src[6];
808     CLIP_STORE(2,C);
809     C = 16-RND - (Src[0]+Src[7]) + 3*(Src[1]+Src[6])-6*(Src[2]+Src[5]) + 20*(Src[3]+Src[4]);
810     CLIP_STORE(3,C);
811     C = 16-RND - (Src[1]+Src[8]) + 3*(Src[2]+Src[7])-6*(Src[3]+Src[6]) + 20*(Src[4]+Src[5]);
812     CLIP_STORE(4,C);
813     C = 16-RND - Src[2] +3*Src[3] -6*(Src[4]+Src[7]) + 20*(Src[5]+Src[6]) +2*Src[8];
814     CLIP_STORE(5,C);
815     C = 16-RND - Src[3] +3*(Src[4]-Src[8]) -6*Src[5] + 20*Src[6] + 19*Src[7];
816     CLIP_STORE(6,C);
817     C = 16-RND - Src[4] +3*Src[5] -7*Src[6] + 23*Src[7] + 14*Src[8];
818     CLIP_STORE(7,C);
819     Src += BpS;
820     Dst += BpS;
821   }
822 #endif
823 }
824 #undef CLIP_STORE
825 
826 //////////////////////////////////////////////////////////
827 // vertical passes
828 //////////////////////////////////////////////////////////
829 // Note: for vertical passes, width (W) needs only be 8 or 16.
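/* In these unrolled vertical passes the filter taps are BpS bytes apart
 * (one scanline), and the outer while() steps one byte to the right per
 * iteration (Src += 1, Dst += 1). The parameter is still called H in the
 * signatures below, but it actually counts columns, i.e. the block width. */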
830 
831 #define CLIP_STORE(D,C) \
832   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
833   STORE(D, C)
834 
835 static void
836 FUNC_V(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
837 {
838 #if (SIZE==16)
839   while(H-->0) {
840     int C;
841     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
842     CLIP_STORE(Dst[BpS* 0],C);
843     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
844     CLIP_STORE(Dst[BpS* 1],C);
845     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
846     CLIP_STORE(Dst[BpS* 2],C);
847     C = 16-RND - (Src[BpS*0]+Src[BpS*7 ]) + 3*(Src[BpS* 1]+Src[BpS* 6])-6*(Src[BpS* 2]+Src[BpS* 5]) + 20*(Src[BpS* 3]+Src[BpS* 4]);
848     CLIP_STORE(Dst[BpS* 3],C);
849     C = 16-RND - (Src[BpS*1]+Src[BpS*8 ]) + 3*(Src[BpS* 2]+Src[BpS* 7])-6*(Src[BpS* 3]+Src[BpS* 6]) + 20*(Src[BpS* 4]+Src[BpS* 5]);
850     CLIP_STORE(Dst[BpS* 4],C);
851     C = 16-RND - (Src[BpS*2]+Src[BpS*9 ]) + 3*(Src[BpS* 3]+Src[BpS* 8])-6*(Src[BpS* 4]+Src[BpS* 7]) + 20*(Src[BpS* 5]+Src[BpS* 6]);
852     CLIP_STORE(Dst[BpS* 5],C);
853     C = 16-RND - (Src[BpS*3]+Src[BpS*10]) + 3*(Src[BpS* 4]+Src[BpS* 9])-6*(Src[BpS* 5]+Src[BpS* 8]) + 20*(Src[BpS* 6]+Src[BpS* 7]);
854     CLIP_STORE(Dst[BpS* 6],C);
855     C = 16-RND - (Src[BpS*4]+Src[BpS*11]) + 3*(Src[BpS* 5]+Src[BpS*10])-6*(Src[BpS* 6]+Src[BpS* 9]) + 20*(Src[BpS* 7]+Src[BpS* 8]);
856     CLIP_STORE(Dst[BpS* 7],C);
857     C = 16-RND - (Src[BpS*5]+Src[BpS*12]) + 3*(Src[BpS* 6]+Src[BpS*11])-6*(Src[BpS* 7]+Src[BpS*10]) + 20*(Src[BpS* 8]+Src[BpS* 9]);
858     CLIP_STORE(Dst[BpS* 8],C);
859     C = 16-RND - (Src[BpS*6]+Src[BpS*13]) + 3*(Src[BpS* 7]+Src[BpS*12])-6*(Src[BpS* 8]+Src[BpS*11]) + 20*(Src[BpS* 9]+Src[BpS*10]);
860     CLIP_STORE(Dst[BpS* 9],C);
861     C = 16-RND - (Src[BpS*7]+Src[BpS*14]) + 3*(Src[BpS* 8]+Src[BpS*13])-6*(Src[BpS* 9]+Src[BpS*12]) + 20*(Src[BpS*10]+Src[BpS*11]);
862     CLIP_STORE(Dst[BpS*10],C);
863     C = 16-RND - (Src[BpS*8]+Src[BpS*15]) + 3*(Src[BpS* 9]+Src[BpS*14])-6*(Src[BpS*10]+Src[BpS*13]) + 20*(Src[BpS*11]+Src[BpS*12]);
864     CLIP_STORE(Dst[BpS*11],C);
865     C = 16-RND - (Src[BpS*9]+Src[BpS*16]) + 3*(Src[BpS*10]+Src[BpS*15])-6*(Src[BpS*11]+Src[BpS*14]) + 20*(Src[BpS*12]+Src[BpS*13]);
866     CLIP_STORE(Dst[BpS*12],C);
867     C = 16-RND - Src[BpS*10] +3*Src[BpS*11] -6*(Src[BpS*12]+Src[BpS*15]) + 20*(Src[BpS*13]+Src[BpS*14]) +2*Src[BpS*16];
868     CLIP_STORE(Dst[BpS*13],C);
869     C = 16-RND - Src[BpS*11] +3*(Src[BpS*12]-Src[BpS*16]) -6*Src[BpS*13] + 20*Src[BpS*14] + 19*Src[BpS*15];
870     CLIP_STORE(Dst[BpS*14],C);
871     C = 16-RND - Src[BpS*12] +3*Src[BpS*13] -7*Src[BpS*14] + 23*Src[BpS*15] + 14*Src[BpS*16];
872     CLIP_STORE(Dst[BpS*15],C);
873     Src += 1;
874     Dst += 1;
875   }
876 #else
877   while(H-->0) {
878     int C;
879     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
880     CLIP_STORE(Dst[BpS*0],C);
881     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
882     CLIP_STORE(Dst[BpS*1],C);
883     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
884     CLIP_STORE(Dst[BpS*2],C);
885     C = 16-RND - (Src[BpS*0]+Src[BpS*7]) + 3*(Src[BpS*1]+Src[BpS*6])-6*(Src[BpS*2]+Src[BpS*5]) + 20*(Src[BpS*3]+Src[BpS*4]);
886     CLIP_STORE(Dst[BpS*3],C);
887     C = 16-RND - (Src[BpS*1]+Src[BpS*8]) + 3*(Src[BpS*2]+Src[BpS*7])-6*(Src[BpS*3]+Src[BpS*6]) + 20*(Src[BpS*4]+Src[BpS*5]);
888     CLIP_STORE(Dst[BpS*4],C);
889     C = 16-RND - Src[BpS*2] +3*Src[BpS*3] -6*(Src[BpS*4]+Src[BpS*7]) + 20*(Src[BpS*5]+Src[BpS*6]) +2*Src[BpS*8];
890     CLIP_STORE(Dst[BpS*5],C);
891     C = 16-RND - Src[BpS*3] +3*(Src[BpS*4]-Src[BpS*8]) -6*Src[BpS*5] + 20*Src[BpS*6] + 19*Src[BpS*7];
892     CLIP_STORE(Dst[BpS*6],C);
893     C = 16-RND - Src[BpS*4] +3*Src[BpS*5] -7*Src[BpS*6] + 23*Src[BpS*7] + 14*Src[BpS*8];
894     CLIP_STORE(Dst[BpS*7],C);
895     Src += 1;
896     Dst += 1;
897   }
898 #endif
899 }
900 #undef CLIP_STORE
901 
902 #define CLIP_STORE(i,C) \
903   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
904   C = (C+Src[BpS*i]+1-RND) >> 1;  \
905   STORE(Dst[BpS*i], C)
906 
907 static void
908 FUNC_VA(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
909 {
910 #if (SIZE==16)
911   while(H-->0) {
912     int C;
913     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
914     CLIP_STORE(0,C);
915     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
916     CLIP_STORE( 1,C);
917     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
918     CLIP_STORE( 2,C);
919     C = 16-RND - (Src[BpS*0]+Src[BpS*7 ]) + 3*(Src[BpS* 1]+Src[BpS* 6])-6*(Src[BpS* 2]+Src[BpS* 5]) + 20*(Src[BpS* 3]+Src[BpS* 4]);
920     CLIP_STORE( 3,C);
921     C = 16-RND - (Src[BpS*1]+Src[BpS*8 ]) + 3*(Src[BpS* 2]+Src[BpS* 7])-6*(Src[BpS* 3]+Src[BpS* 6]) + 20*(Src[BpS* 4]+Src[BpS* 5]);
922     CLIP_STORE( 4,C);
923     C = 16-RND - (Src[BpS*2]+Src[BpS*9 ]) + 3*(Src[BpS* 3]+Src[BpS* 8])-6*(Src[BpS* 4]+Src[BpS* 7]) + 20*(Src[BpS* 5]+Src[BpS* 6]);
924     CLIP_STORE( 5,C);
925     C = 16-RND - (Src[BpS*3]+Src[BpS*10]) + 3*(Src[BpS* 4]+Src[BpS* 9])-6*(Src[BpS* 5]+Src[BpS* 8]) + 20*(Src[BpS* 6]+Src[BpS* 7]);
926     CLIP_STORE( 6,C);
927     C = 16-RND - (Src[BpS*4]+Src[BpS*11]) + 3*(Src[BpS* 5]+Src[BpS*10])-6*(Src[BpS* 6]+Src[BpS* 9]) + 20*(Src[BpS* 7]+Src[BpS* 8]);
928     CLIP_STORE( 7,C);
929     C = 16-RND - (Src[BpS*5]+Src[BpS*12]) + 3*(Src[BpS* 6]+Src[BpS*11])-6*(Src[BpS* 7]+Src[BpS*10]) + 20*(Src[BpS* 8]+Src[BpS* 9]);
930     CLIP_STORE( 8,C);
931     C = 16-RND - (Src[BpS*6]+Src[BpS*13]) + 3*(Src[BpS* 7]+Src[BpS*12])-6*(Src[BpS* 8]+Src[BpS*11]) + 20*(Src[BpS* 9]+Src[BpS*10]);
932     CLIP_STORE( 9,C);
933     C = 16-RND - (Src[BpS*7]+Src[BpS*14]) + 3*(Src[BpS* 8]+Src[BpS*13])-6*(Src[BpS* 9]+Src[BpS*12]) + 20*(Src[BpS*10]+Src[BpS*11]);
934     CLIP_STORE(10,C);
935     C = 16-RND - (Src[BpS*8]+Src[BpS*15]) + 3*(Src[BpS* 9]+Src[BpS*14])-6*(Src[BpS*10]+Src[BpS*13]) + 20*(Src[BpS*11]+Src[BpS*12]);
936     CLIP_STORE(11,C);
937     C = 16-RND - (Src[BpS*9]+Src[BpS*16]) + 3*(Src[BpS*10]+Src[BpS*15])-6*(Src[BpS*11]+Src[BpS*14]) + 20*(Src[BpS*12]+Src[BpS*13]);
938     CLIP_STORE(12,C);
939     C = 16-RND - Src[BpS*10] +3*Src[BpS*11] -6*(Src[BpS*12]+Src[BpS*15]) + 20*(Src[BpS*13]+Src[BpS*14]) +2*Src[BpS*16];
940     CLIP_STORE(13,C);
941     C = 16-RND - Src[BpS*11] +3*(Src[BpS*12]-Src[BpS*16]) -6*Src[BpS*13] + 20*Src[BpS*14] + 19*Src[BpS*15];
942     CLIP_STORE(14,C);
943     C = 16-RND - Src[BpS*12] +3*Src[BpS*13] -7*Src[BpS*14] + 23*Src[BpS*15] + 14*Src[BpS*16];
944     CLIP_STORE(15,C);
945     Src += 1;
946     Dst += 1;
947   }
948 #else
949   while(H-->0) {
950     int C;
951     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
952     CLIP_STORE(0,C);
953     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
954     CLIP_STORE(1,C);
955     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
956     CLIP_STORE(2,C);
957     C = 16-RND - (Src[BpS*0]+Src[BpS*7]) + 3*(Src[BpS*1]+Src[BpS*6])-6*(Src[BpS*2]+Src[BpS*5]) + 20*(Src[BpS*3]+Src[BpS*4]);
958     CLIP_STORE(3,C);
959     C = 16-RND - (Src[BpS*1]+Src[BpS*8]) + 3*(Src[BpS*2]+Src[BpS*7])-6*(Src[BpS*3]+Src[BpS*6]) + 20*(Src[BpS*4]+Src[BpS*5]);
960     CLIP_STORE(4,C);
961     C = 16-RND - Src[BpS*2] +3*Src[BpS*3] -6*(Src[BpS*4]+Src[BpS*7]) + 20*(Src[BpS*5]+Src[BpS*6]) +2*Src[BpS*8];
962     CLIP_STORE(5,C);
963     C = 16-RND - Src[BpS*3] +3*(Src[BpS*4]-Src[BpS*8]) -6*Src[BpS*5] + 20*Src[BpS*6] + 19*Src[BpS*7];
964     CLIP_STORE(6,C);
965     C = 16-RND - Src[BpS*4] +3*Src[BpS*5] -7*Src[BpS*6] + 23*Src[BpS*7] + 14*Src[BpS*8];
966     CLIP_STORE(7,C);
967     Src += 1;
968     Dst += 1;
969   }
970 #endif
971 }
972 #undef CLIP_STORE
973 
974 #define CLIP_STORE(i,C) \
975   if (C<0) C = 0; else if (C>(255<<5)) C = 255; else C = C>>5;  \
976   C = (C+Src[BpS*i+BpS]+1-RND) >> 1;  \
977   STORE(Dst[BpS*i], C)
978 
979 static void
980 FUNC_VA_UP(uint8_t *Dst, const uint8_t *Src, int32_t H, int32_t BpS, int32_t RND)
981 {
982 #if (SIZE==16)
983   while(H-->0) {
984     int C;
985     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
986     CLIP_STORE(0,C);
987     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
988     CLIP_STORE( 1,C);
989     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
990     CLIP_STORE( 2,C);
991     C = 16-RND - (Src[BpS*0]+Src[BpS*7 ]) + 3*(Src[BpS* 1]+Src[BpS* 6])-6*(Src[BpS* 2]+Src[BpS* 5]) + 20*(Src[BpS* 3]+Src[BpS* 4]);
992     CLIP_STORE( 3,C);
993     C = 16-RND - (Src[BpS*1]+Src[BpS*8 ]) + 3*(Src[BpS* 2]+Src[BpS* 7])-6*(Src[BpS* 3]+Src[BpS* 6]) + 20*(Src[BpS* 4]+Src[BpS* 5]);
994     CLIP_STORE( 4,C);
995     C = 16-RND - (Src[BpS*2]+Src[BpS*9 ]) + 3*(Src[BpS* 3]+Src[BpS* 8])-6*(Src[BpS* 4]+Src[BpS* 7]) + 20*(Src[BpS* 5]+Src[BpS* 6]);
996     CLIP_STORE( 5,C);
997     C = 16-RND - (Src[BpS*3]+Src[BpS*10]) + 3*(Src[BpS* 4]+Src[BpS* 9])-6*(Src[BpS* 5]+Src[BpS* 8]) + 20*(Src[BpS* 6]+Src[BpS* 7]);
998     CLIP_STORE( 6,C);
999     C = 16-RND - (Src[BpS*4]+Src[BpS*11]) + 3*(Src[BpS* 5]+Src[BpS*10])-6*(Src[BpS* 6]+Src[BpS* 9]) + 20*(Src[BpS* 7]+Src[BpS* 8]);
1000     CLIP_STORE( 7,C);
1001     C = 16-RND - (Src[BpS*5]+Src[BpS*12]) + 3*(Src[BpS* 6]+Src[BpS*11])-6*(Src[BpS* 7]+Src[BpS*10]) + 20*(Src[BpS* 8]+Src[BpS* 9]);
1002     CLIP_STORE( 8,C);
1003     C = 16-RND - (Src[BpS*6]+Src[BpS*13]) + 3*(Src[BpS* 7]+Src[BpS*12])-6*(Src[BpS* 8]+Src[BpS*11]) + 20*(Src[BpS* 9]+Src[BpS*10]);
1004     CLIP_STORE( 9,C);
1005     C = 16-RND - (Src[BpS*7]+Src[BpS*14]) + 3*(Src[BpS* 8]+Src[BpS*13])-6*(Src[BpS* 9]+Src[BpS*12]) + 20*(Src[BpS*10]+Src[BpS*11]);
1006     CLIP_STORE(10,C);
1007     C = 16-RND - (Src[BpS*8]+Src[BpS*15]) + 3*(Src[BpS* 9]+Src[BpS*14])-6*(Src[BpS*10]+Src[BpS*13]) + 20*(Src[BpS*11]+Src[BpS*12]);
1008     CLIP_STORE(11,C);
1009     C = 16-RND - (Src[BpS*9]+Src[BpS*16]) + 3*(Src[BpS*10]+Src[BpS*15])-6*(Src[BpS*11]+Src[BpS*14]) + 20*(Src[BpS*12]+Src[BpS*13]);
1010     CLIP_STORE(12,C);
1011     C = 16-RND - Src[BpS*10] +3*Src[BpS*11] -6*(Src[BpS*12]+Src[BpS*15]) + 20*(Src[BpS*13]+Src[BpS*14]) +2*Src[BpS*16];
1012     CLIP_STORE(13,C);
1013     C = 16-RND - Src[BpS*11] +3*(Src[BpS*12]-Src[BpS*16]) -6*Src[BpS*13] + 20*Src[BpS*14] + 19*Src[BpS*15];
1014     CLIP_STORE(14,C);
1015     C = 16-RND - Src[BpS*12] +3*Src[BpS*13] -7*Src[BpS*14] + 23*Src[BpS*15] + 14*Src[BpS*16];
1016     CLIP_STORE(15,C);
1017     Src += 1;
1018     Dst += 1;
1019   }
1020 #else
1021   while(H-->0) {
1022     int C;
1023     C = 16-RND +14*Src[BpS*0] +23*Src[BpS*1] - 7*Src[BpS*2] + 3*Src[BpS*3] -   Src[BpS*4];
1024     CLIP_STORE(0,C);
1025     C = 16-RND - 3*(Src[BpS*0]-Src[BpS*4]) +19*Src[BpS*1] +20*Src[BpS*2] - 6*Src[BpS*3] - Src[BpS*5];
1026     CLIP_STORE(1,C);
1027     C = 16-RND + 2*Src[BpS*0] - 6*(Src[BpS*1]+Src[BpS*4]) +20*(Src[BpS*2]+Src[BpS*3]) + 3*Src[BpS*5] - Src[BpS*6];
1028     CLIP_STORE(2,C);
1029     C = 16-RND - (Src[BpS*0]+Src[BpS*7]) + 3*(Src[BpS*1]+Src[BpS*6])-6*(Src[BpS*2]+Src[BpS*5]) + 20*(Src[BpS*3]+Src[BpS*4]);
1030     CLIP_STORE(3,C);
1031     C = 16-RND - (Src[BpS*1]+Src[BpS*8]) + 3*(Src[BpS*2]+Src[BpS*7])-6*(Src[BpS*3]+Src[BpS*6]) + 20*(Src[BpS*4]+Src[BpS*5]);
1032     CLIP_STORE(4,C);
1033     C = 16-RND - Src[BpS*2] +3*Src[BpS*3] -6*(Src[BpS*4]+Src[BpS*7]) + 20*(Src[BpS*5]+Src[BpS*6]) +2*Src[BpS*8];
1034     CLIP_STORE(5,C);
1035     C = 16-RND - Src[BpS*3] +3*(Src[BpS*4]-Src[BpS*8]) -6*Src[BpS*5] + 20*Src[BpS*6] + 19*Src[BpS*7];
1036     CLIP_STORE(6,C);
1037     C = 16-RND - Src[BpS*4] +3*Src[BpS*5] -7*Src[BpS*6] + 23*Src[BpS*7] + 14*Src[BpS*8];
1038     CLIP_STORE(7,C);
1039     Src += 1;
1040     Dst += 1;
1041   }
1042 #endif
1043 }
1044 #undef CLIP_STORE
1045 
1046 #undef STORE
1047 #undef FUNC_H
1048 #undef FUNC_V
1049 #undef FUNC_HA
1050 #undef FUNC_VA
1051 #undef FUNC_HA_UP
1052 #undef FUNC_VA_UP
1053 
1054 
1055 #endif /* XVID_AUTO_INCLUDE && !REFERENCE_CODE */
1056