1 /*****************************************************************************
2  *
3  *	XVID MPEG-4 VIDEO CODEC
4  *	- 8x8 block-based halfpel interpolation -
5  *
6  *  Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
7  *
8  *  This program is free software ; you can redistribute it and/or modify
9  *  it under the terms of the GNU General Public License as published by
10  *  the Free Software Foundation ; either version 2 of the License, or
11  *  (at your option) any later version.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program ; if not, write to the Free Software
20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
21  *
22  * $Id: interpolate8x8.c 1985 2011-05-18 09:02:35Z Isibaar $
23  *
24  ****************************************************************************/
25 
26 #include "../portab.h"
27 #include "../global.h"
28 #include "interpolate8x8.h"
29 
30 /* function pointers */
31 INTERPOLATE8X8_PTR interpolate8x8_halfpel_h;
32 INTERPOLATE8X8_PTR interpolate8x8_halfpel_v;
33 INTERPOLATE8X8_PTR interpolate8x8_halfpel_hv;
34 
35 INTERPOLATE8X8_PTR interpolate8x4_halfpel_h;
36 INTERPOLATE8X8_PTR interpolate8x4_halfpel_v;
37 INTERPOLATE8X8_PTR interpolate8x4_halfpel_hv;
38 
39 INTERPOLATE8X8_PTR interpolate8x8_halfpel_add;
40 INTERPOLATE8X8_PTR interpolate8x8_halfpel_h_add;
41 INTERPOLATE8X8_PTR interpolate8x8_halfpel_v_add;
42 INTERPOLATE8X8_PTR interpolate8x8_halfpel_hv_add;
43 
44 INTERPOLATE8X8_AVG2_PTR interpolate8x8_avg2;
45 INTERPOLATE8X8_AVG4_PTR interpolate8x8_avg4;
46 
47 INTERPOLATE_LOWPASS_PTR interpolate8x8_lowpass_h;
48 INTERPOLATE_LOWPASS_PTR interpolate8x8_lowpass_v;
49 
50 INTERPOLATE_LOWPASS_PTR interpolate16x16_lowpass_h;
51 INTERPOLATE_LOWPASS_PTR interpolate16x16_lowpass_v;
52 
53 INTERPOLATE_LOWPASS_HV_PTR interpolate8x8_lowpass_hv;
54 INTERPOLATE_LOWPASS_HV_PTR interpolate16x16_lowpass_hv;
55 
56 INTERPOLATE8X8_6TAP_LOWPASS_PTR interpolate8x8_6tap_lowpass_h;
57 INTERPOLATE8X8_6TAP_LOWPASS_PTR interpolate8x8_6tap_lowpass_v;
58 
/*
 * Average two 8-pixel-wide blocks into dst, row by row:
 *
 *   dst[x] = (src1[x] + src2[x] + 1 - rounding) >> 1
 *
 * dst      - output block; each sample is read from the sources before the
 *            matching dst sample is written, so dst may be the same buffer
 *            as src1
 * src1     - first input block
 * src2     - second input block
 * stride   - distance in bytes between rows, shared by all three buffers
 * rounding - 0 rounds halves up, 1 rounds them down
 * height   - number of rows to process
 */
void
interpolate8x8_avg2_c(uint8_t * dst, const uint8_t * src1, const uint8_t *src2, const uint32_t stride, const uint32_t rounding, const uint32_t height)
{
	const int32_t round_add = 1 - rounding;
	uint32_t row;

	for (row = 0; row < height; row++) {
		int x;

		for (x = 0; x < 8; x++)
			dst[x] = (src1[x] + src2[x] + round_add) >> 1;

		dst += stride;
		src1 += stride;
		src2 += stride;
	}
}
80 
/*
 * dst = (dst + src + 1) / 2 over an 8x8 block.
 *
 * Thin wrapper around interpolate8x8_avg2_c() with dst used as the first
 * source and a height of 8 rows.
 *
 * The `rounding` argument is not used: the averaging is always requested
 * with rounding 0.  Presumably the parameter only exists so the function
 * matches the signature of the other halfpel routines.
 */
void
interpolate8x8_halfpel_add_c(uint8_t * const dst, const uint8_t * const src, const uint32_t stride, const uint32_t rounding)
{
	(void)rounding;	/* unused, see above */

	interpolate8x8_avg2_c(dst, dst, src, stride, 0, 8);
}
86 
/*
 * Average four 8x8 blocks into dst:
 *
 *   dst[x] = (src1[x] + src2[x] + src3[x] + src4[x] + 2 - rounding) >> 2
 *
 * dst       - output block
 * src1..4   - input blocks
 * stride    - distance in bytes between rows, shared by all five buffers
 * rounding  - subtracted from the rounding constant 2
 */
void interpolate8x8_avg4_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4, const uint32_t stride, const uint32_t rounding)
{
	const int32_t round_add = 2 - rounding;
	int32_t row;

	for (row = 0; row < 8; row++) {
		int x;

		for (x = 0; x < 8; x++)
			dst[x] = (src1[x] + src2[x] + src3[x] + src4[x] + round_add) >> 2;

		dst += stride;
		src1 += stride;
		src2 += stride;
		src3 += stride;
		src4 += stride;
	}
}
109 
110 /* dst = interpolate(src) */
111 
/*
 * Horizontal halfpel interpolation of an 8x8 block:
 *
 *   dst[x] = (src[x] + src[x+1] + (rounding ? 0 : 1)) >> 1
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 9 samples (src[0] .. src[8]) are read per row
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 so halves round down
 */
void
interpolate8x8_halfpel_h_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *in = src + j;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((in[x] + in[x + 1] + round_add) >> 1);
	}
}
144 
145 /* dst = interpolate(src) */
146 
/*
 * Horizontal halfpel interpolation of an 8x4 block (8 wide, 4 rows):
 *
 *   dst[x] = (src[x] + src[x+1] + (rounding ? 0 : 1)) >> 1
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 9 samples (src[0] .. src[8]) are read per row
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 so halves round down
 */
void
interpolate8x4_halfpel_h_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, 2*stride, 3*stride */
	for (j = 0; j < 4*stride; j += stride) {
		const uint8_t *in = src + j;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((in[x] + in[x + 1] + round_add) >> 1);
	}
}
179 
/* dst = (dst + interpolate(src))/2 */
181 
/*
 * Horizontal halfpel interpolation of an 8x8 block, averaged into dst:
 *
 *   half   = (src[x] + src[x+1] + (rounding ? 0 : 1)) >> 1
 *   dst[x] = (half + dst[x] + 1) >> 1
 *
 * The final average with dst always rounds up; `rounding` only affects the
 * interpolation step.
 *
 * dst      - block that is both read and overwritten, 8 samples per row
 * src      - input block; 9 samples (src[0] .. src[8]) are read per row
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 of the interpolation step
 */
void
interpolate8x8_halfpel_h_add_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *in = src + j;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((((in[x] + in[x + 1] + round_add) >> 1) + out[x] + 1) >> 1);
	}
}
214 
215 /* dst = interpolate(src) */
216 
/*
 * Vertical halfpel interpolation of an 8x8 block:
 *
 *   dst[x] = (src[x] + src[x + stride] + (rounding ? 0 : 1)) >> 1
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 9 rows of 8 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 so halves round down
 */
void
interpolate8x8_halfpel_v_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((upper[x] + lower[x] + round_add) >> 1);
	}
}
250 
251 /* dst = interpolate(src) */
252 
/*
 * Vertical halfpel interpolation of an 8x4 block (8 wide, 4 rows):
 *
 *   dst[x] = (src[x] + src[x + stride] + (rounding ? 0 : 1)) >> 1
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 5 rows of 8 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 so halves round down
 */
void
interpolate8x4_halfpel_v_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, 2*stride, 3*stride */
	for (j = 0; j < 4*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((upper[x] + lower[x] + round_add) >> 1);
	}
}
286 
287 /* dst = (dst + interpolate(src))/2 */
288 
/*
 * Vertical halfpel interpolation of an 8x8 block, averaged into dst:
 *
 *   half   = (src[x] + src[x + stride] + (rounding ? 0 : 1)) >> 1
 *   dst[x] = (half + dst[x] + 1) >> 1
 *
 * The final average with dst always rounds up; `rounding` only affects the
 * interpolation step.
 *
 * dst      - block that is both read and overwritten, 8 samples per row
 * src      - input block; 9 rows of 8 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero drops the +1 of the interpolation step
 */
void
interpolate8x8_halfpel_v_add_c(uint8_t * const dst,
						   const uint8_t * const src,
						   const uint32_t stride,
						   const uint32_t rounding)
{
	const int round_add = rounding ? 0 : 1;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((((upper[x] + lower[x] + round_add) >> 1) + out[x] + 1) >> 1);
	}
}
322 
323 /* dst = interpolate(src) */
324 
/*
 * Diagonal (horizontal + vertical) halfpel interpolation of an 8x8 block:
 *
 *   dst[x] = (src[x] + src[x+1] + src[x+stride] + src[x+stride+1]
 *             + (rounding ? 1 : 2)) >> 2
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 9 rows of 9 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero lowers the rounding constant from 2 to 1
 */
void
interpolate8x8_halfpel_hv_c(uint8_t * const dst,
							const uint8_t * const src,
							const uint32_t stride,
							const uint32_t rounding)
{
	const int round_add = rounding ? 1 : 2;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((upper[x] + upper[x + 1] + lower[x] + lower[x + 1] + round_add) >> 2);
	}
}
357 
358 /* dst = interpolate(src) */
359 
/*
 * Diagonal (horizontal + vertical) halfpel interpolation of an 8x4 block
 * (8 wide, 4 rows):
 *
 *   dst[x] = (src[x] + src[x+1] + src[x+stride] + src[x+stride+1]
 *             + (rounding ? 1 : 2)) >> 2
 *
 * dst      - output block, 8 samples per row
 * src      - input block; 5 rows of 9 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero lowers the rounding constant from 2 to 1
 */
void
interpolate8x4_halfpel_hv_c(uint8_t * const dst,
							const uint8_t * const src,
							const uint32_t stride,
							const uint32_t rounding)
{
	const int round_add = rounding ? 1 : 2;
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, 2*stride, 3*stride */
	for (j = 0; j < 4*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((upper[x] + upper[x + 1] + lower[x] + lower[x + 1] + round_add) >> 2);
	}
}
392 
393 /* dst = (interpolate(src) + dst)/2 */
394 
/*
 * Diagonal halfpel interpolation of an 8x8 block, averaged into dst:
 *
 *   quad   = (src[x] + src[x+1] + src[x+stride] + src[x+stride+1]
 *             + (rounding ? 1 : 2)) >> 2
 *   dst[x] = (quad + dst[x] + (rounding ? 0 : 1)) >> 1
 *
 * NOTE(review): with rounding set, the final average with dst is taken
 * without the +1 that interpolate8x8_halfpel_h_add_c and
 * interpolate8x8_halfpel_v_add_c always apply.  The behaviour is kept as
 * found - confirm whether the difference is intended.
 *
 * dst      - block that is both read and overwritten, 8 samples per row
 * src      - input block; 9 rows of 9 samples are read
 * stride   - distance in bytes between rows, shared by dst and src
 * rounding - non-zero selects the smaller rounding constants shown above
 */
void
interpolate8x8_halfpel_hv_add_c(uint8_t * const dst,
							const uint8_t * const src,
							const uint32_t stride,
							const uint32_t rounding)
{
	const int quad_round = rounding ? 1 : 2;	/* 4-sample average */
	const int avg_round = rounding ? 0 : 1;		/* average with dst */
	uintptr_t j;

	/* j is the byte offset of the current row: 0, stride, ..., 7*stride */
	for (j = 0; j < 8*stride; j += stride) {
		const uint8_t *upper = src + j;
		const uint8_t *lower = src + j + stride;
		uint8_t *out = dst + j;
		int x;

		for (x = 0; x < 8; x++)
			out[x] = (uint8_t)((((upper[x] + upper[x + 1] + lower[x] + lower[x + 1] + quad_round) >> 2) + out[x] + avg_round) >> 1);
	}
}
427 
428 /*************************************************************
429  * QPEL STUFF STARTS HERE                                    *
430  *************************************************************/
431 
/*
 * Horizontal 6-tap lowpass filter over an 8x8 block.
 *
 * Expanded, each output sample is
 *
 *   dst[x] = CLIP((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                  - 5*src[x+2] + src[x+3] + 16 - rounding) >> 5, 0, 255)
 *
 * so every row reads src[-2] .. src[10]; the caller has to make the two
 * samples left and the three samples right of the block readable.
 *
 * dst      - output block, 8 samples per row
 * src      - input block
 * stride   - distance between rows, shared by dst and src
 * rounding - subtracted from the rounding constant 16
 */
void interpolate8x8_6tap_lowpass_h_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	uint8_t round_add = 16 - rounding;
	int32_t row;

	for (row = 0; row < 8; row++) {
		int32_t x;

		for (x = 0; x < 8; x++)
			dst[x] = CLIP((((src[x - 2] + src[x + 3]) + 5 * (((src[x] + src[x + 1])<<2) - (src[x - 1] + src[x + 2])) + round_add) >> 5), 0, 255);

		dst += stride;
		src += stride;
	}
}
453 
/*
 * Horizontal lowpass filter producing 16 output samples per row from the 17
 * input samples src[0] .. src[16].  17 rows are processed.
 *
 * Outputs 3 .. 12 use the symmetric 8-tap kernel
 *
 *   -1  3  -6  20  20  -6  3  -1      (applied to src[x-3] .. src[x+4])
 *
 * Outputs 0 .. 2 and 13 .. 15, whose window would reach outside
 * src[0] .. src[16], use their own coefficient sets, written out below.
 * Every sum gets (16 - rounding) added, is shifted right by 5 and passed
 * through CLIP(..., 0, 255).
 *
 * dst      - output block, 16 samples per row
 * src      - input block, 17 samples per row
 * stride   - distance between rows, shared by dst and src
 * rounding - subtracted from the rounding constant 16
 */
void interpolate16x16_lowpass_h_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	uint8_t round_add = 16 - rounding;
	int32_t row;

	for (row = 0; row < 17; row++) {
		int32_t x;

		/* leading edge */
		dst[0] = CLIP(((7 * ((src[0]<<1) - src[2]) + 23 * src[1] + 3 * src[3] - src[4] + round_add) >> 5), 0, 255);
		dst[1] = CLIP(((19 * src[1] + 20 * src[2] - src[5] + 3 * (src[4] - src[0] - (src[3]<<1)) + round_add) >> 5), 0, 255);
		dst[2] = CLIP(((20 * (src[2] + src[3]) + (src[0]<<1) + 3 * (src[5] - ((src[1] + src[4])<<1)) - src[6] + round_add) >> 5), 0, 255);

		/* interior: full 8-tap window */
		for (x = 3; x <= 12; x++)
			dst[x] = CLIP(((20 * (src[x] + src[x + 1]) - 3 * (((src[x - 1] + src[x + 2])<<1) - (src[x - 2] + src[x + 3])) - (src[x - 3] + src[x + 4]) + round_add) >> 5), 0, 255);

		/* trailing edge */
		dst[13] = CLIP(((20 * (src[13] + src[14]) + (src[16]<<1) + 3 * (src[11] - ((src[12] + src[15]) << 1)) - src[10] + round_add) >> 5), 0, 255);
		dst[14] = CLIP(((19 * src[15] + 20 * src[14] + 3 * (src[12] - src[16] - (src[13] << 1)) - src[11] + round_add) >> 5), 0, 255);
		dst[15] = CLIP(((23 * src[15] + 7 * ((src[16]<<1) - src[14]) + 3 * src[13] - src[12] + round_add) >> 5), 0, 255);

		dst += stride;
		src += stride;
	}
}
485 
/*
 * Horizontal lowpass filter producing 8 output samples per row from the 9
 * input samples src[0] .. src[8].  9 rows are processed.
 *
 * Outputs 3 and 4 use the symmetric 8-tap kernel
 *
 *   -1  3  -6  20  20  -6  3  -1      (applied to src[x-3] .. src[x+4])
 *
 * The remaining outputs, whose window would reach outside src[0] .. src[8],
 * use their own coefficient sets.  Every sum gets (16 - rounding) added, is
 * shifted right by 5 and passed through CLIP(..., 0, 255).
 *
 * dst      - output block, 8 samples per row
 * src      - input block, 9 samples per row
 * stride   - distance between rows, shared by dst and src
 * rounding - subtracted from the rounding constant 16
 */
void interpolate8x8_lowpass_h_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	uint8_t round_add = 16 - rounding;
	int32_t row;

	for (row = 0; row < 9; row++) {
		/* leading edge */
		dst[0] = CLIP(((7 * ((src[0]<<1) - src[2]) + 23 * src[1] + 3 * src[3] - src[4] + round_add) >> 5), 0, 255);
		dst[1] = CLIP(((19 * src[1] + 20 * src[2] - src[5] + 3 * (src[4] - src[0] - (src[3]<<1)) + round_add) >> 5), 0, 255);
		dst[2] = CLIP(((20 * (src[2] + src[3]) + (src[0]<<1) + 3 * (src[5] - ((src[1] + src[4])<<1)) - src[6] + round_add) >> 5), 0, 255);

		/* interior: full 8-tap window */
		dst[3] = CLIP(((20 * (src[3] + src[4]) + 3 * ((src[6] + src[1]) - ((src[2] + src[5])<<1)) - (src[0] + src[7]) + round_add) >> 5), 0, 255);
		dst[4] = CLIP(((20 * (src[4] + src[5]) - 3 * (((src[3] + src[6])<<1) - (src[2] + src[7])) - (src[1] + src[8]) + round_add) >> 5), 0, 255);

		/* trailing edge */
		dst[5] = CLIP(((20 * (src[5] + src[6]) + (src[8]<<1) + 3 * (src[3] - ((src[4] + src[7]) << 1)) - src[2] + round_add) >> 5), 0, 255);
		dst[6] = CLIP(((19 * src[7] + 20 * src[6] + 3 * (src[4] - src[8] - (src[5] << 1)) - src[3] + round_add) >> 5), 0, 255);
		dst[7] = CLIP(((23 * src[7] + 7 * ((src[8]<<1) - src[6]) + 3 * src[5] - src[4] + round_add) >> 5), 0, 255);

		dst += stride;
		src += stride;
	}
}
507 
/*
 * Vertical 6-tap lowpass filter over an 8x8 block, processed column by
 * column.
 *
 * With s[y] = src[y * stride], each output sample is
 *
 *   dst[y * stride] = CLIP((s[y-2] - 5*s[y-1] + 20*s[y] + 20*s[y+1]
 *                           - 5*s[y+2] + s[y+3] + 16 - rounding) >> 5, 0, 255)
 *
 * so every column reads rows -2 .. 10; the caller has to make the two rows
 * above and the three rows below the block readable.
 *
 * dst      - output block, 8 rows per column
 * src      - input block
 * stride   - distance between rows, shared by dst and src
 * rounding - subtracted from the rounding constant 16
 */
void interpolate8x8_6tap_lowpass_v_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	uint8_t round_add = 16 - rounding;
	int32_t col;

	for (col = 0; col < 8; col++) {
		int32_t tap[13];	/* tap[k] holds the sample of row k - 2 */
		int32_t y;

		/* fetch the whole column before anything is written */
		for (y = -2; y <= 10; y++)
			tap[y + 2] = src[y * stride];

		for (y = 0; y < 8; y++)
			dst[y * stride] = CLIP((((tap[y] + tap[y + 5]) + 5 * (((tap[y + 2] + tap[y + 3])<<2) - (tap[y + 1] + tap[y + 4])) + round_add) >> 5), 0, 255);

		dst++;
		src++;
	}
}
542 
/*
 * Vertical lowpass filter producing 16 output rows per column from the 17
 * input rows src[0] .. src[16 * stride].  17 columns are processed.
 *
 * With s[y] = src[y * stride], outputs 3 .. 12 use the symmetric 8-tap
 * kernel
 *
 *   -1  3  -6  20  20  -6  3  -1      (applied to s[y-3] .. s[y+4])
 *
 * Outputs 0 .. 2 and 13 .. 15, whose window would reach outside
 * s[0] .. s[16], use their own coefficient sets, written out below.
 * Every sum gets (16 - rounding) added, is shifted right by 5 and passed
 * through CLIP(..., 0, 255).
 *
 * dst      - output block, 16 rows per column
 * src      - input block, 17 rows per column
 * stride   - distance between rows, shared by dst and src
 * rounding - subtracted from the rounding constant 16
 */
void interpolate16x16_lowpass_v_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	uint8_t round_add = 16 - rounding;
	int32_t col;

	for (col = 0; col < 17; col++) {
		int32_t s[17];
		int32_t y;

		/* fetch the whole column before anything is written */
		for (y = 0; y < 17; y++)
			s[y] = src[y * stride];

		/* leading edge */
		dst[0] = CLIP(((7 * ((s[0]<<1) - s[2]) +  23 * s[1] + 3 * s[3] - s[4] + round_add) >> 5), 0, 255);
		dst[stride] = CLIP(((19 * s[1] + 20 * s[2] - s[5] + 3 * (s[4] - s[0] - (s[3]<<1)) + round_add) >> 5), 0, 255);
		dst[2*stride] = CLIP(((20 * (s[2] + s[3]) + (s[0]<<1) + 3 * (s[5] - ((s[1] + s[4])<<1)) - s[6] + round_add) >> 5), 0, 255);

		/* interior: full 8-tap window */
		for (y = 3; y <= 12; y++)
			dst[y * stride] = CLIP(((20 * (s[y] + s[y + 1]) - 3 * (((s[y - 1] + s[y + 2])<<1) - (s[y - 2] + s[y + 3])) - (s[y - 3] + s[y + 4]) + round_add) >> 5), 0, 255);

		/* trailing edge */
		dst[13*stride] = CLIP(((20 * (s[13] + s[14]) + (s[16]<<1) + 3 * (s[11] - ((s[12] + s[15]) << 1)) - s[10] + round_add) >> 5), 0, 255);
		dst[14*stride] = CLIP(((19 * s[15] + 20 * s[14] + 3 * (s[12] - s[16] - (s[13] << 1)) - s[11] + round_add) >> 5), 0, 255);
		dst[15*stride] = CLIP(((23 * s[15] + 7 * ((s[16]<<1) - s[14]) + 3 * s[13] - s[12] + round_add) >> 5), 0, 255);

		dst++;
		src++;
	}
}
592 
/*
 * Vertical lowpass filter producing 8 output rows.
 *
 * Processes 9 adjacent columns starting at src/dst.  For every column, 9
 * source samples spaced `stride` bytes apart are fetched and 8 filtered
 * samples are stored to dst with the same spacing.
 *
 * Interior rows use the symmetric taps (-1, 3, -6, 20, 20, -6, 3, -1), whose
 * sum is 32 (hence the >> 5).  Rows next to the top or bottom edge use the
 * coefficients obtained by folding the taps that would fall outside the
 * 9 available samples back onto the nearest inside samples.
 *
 * dst      : destination; bytes [0..8] + k * stride, k = 0..7, are written
 * src      : source; bytes [0..8] + k * stride, k = 0..8, are read
 * stride   : distance in bytes between vertically adjacent samples, used for
 *            both src and dst
 * rounding : subtracted from the rounding bias of 16 before the shift
 *
 * Every result goes through CLIP(..., 0, 255) before it is stored.
 */
void interpolate8x8_lowpass_v_c(uint8_t *dst, uint8_t *src, int32_t stride, int32_t rounding)
{
	const uint8_t bias = (uint8_t)(16 - rounding);
	int32_t col;

	for (col = 0; col < 9; col++) {
		const uint8_t *in = src + col;
		uint8_t *out = dst + col;
		int32_t s[9];
		int32_t k;

		/* Fetch the whole column before any output is stored. */
		for (k = 0; k < 9; k++)
			s[k] = in[k * stride];

		/* Rows 0..2: taps folded at the top edge. */
		out[0]          = CLIP(((14 * s[0] + 23 * s[1] - 7 * s[2] + 3 * s[3] - s[4] + bias) >> 5), 0, 255);
		out[stride]     = CLIP(((-3 * s[0] + 19 * s[1] + 20 * s[2] - 6 * s[3] + 3 * s[4] - s[5] + bias) >> 5), 0, 255);
		out[2 * stride] = CLIP(((2 * s[0] - 6 * s[1] + 20 * s[2] + 20 * s[3] - 6 * s[4] + 3 * s[5] - s[6] + bias) >> 5), 0, 255);

		/* Rows 3..4: full symmetric 8-tap window. */
		out[3 * stride] = CLIP(((-s[0] + 3 * s[1] - 6 * s[2] + 20 * s[3] + 20 * s[4] - 6 * s[5] + 3 * s[6] - s[7] + bias) >> 5), 0, 255);
		out[4 * stride] = CLIP(((-s[1] + 3 * s[2] - 6 * s[3] + 20 * s[4] + 20 * s[5] - 6 * s[6] + 3 * s[7] - s[8] + bias) >> 5), 0, 255);

		/* Rows 5..7: taps folded at the bottom edge (mirror of rows 2..0). */
		out[5 * stride] = CLIP(((-s[2] + 3 * s[3] - 6 * s[4] + 20 * s[5] + 20 * s[6] - 6 * s[7] + 2 * s[8] + bias) >> 5), 0, 255);
		out[6 * stride] = CLIP(((-s[3] + 3 * s[4] - 6 * s[5] + 20 * s[6] + 19 * s[7] - 3 * s[8] + bias) >> 5), 0, 255);
		out[7 * stride] = CLIP(((-s[4] + 3 * s[5] - 7 * s[6] + 23 * s[7] + 14 * s[8] + bias) >> 5), 0, 255);
	}
}
623 
/*
 * Horizontal-then-vertical lowpass filter, 16 outputs per row.
 *
 * Pass 1 (here): for 17 consecutive rows, reads src[0..16] of the row and
 * writes 16 horizontally filtered samples to the matching row of dst2.
 * Pass 2: hands dst2 to interpolate16x16_lowpass_v_c(), which writes the
 * final result to dst1.
 *
 * Interior outputs use the symmetric taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * whose sum is 32 (hence the >> 5).  The three outputs next to each end of
 * the row use the coefficients obtained by folding the taps that would fall
 * outside src[0..16] back onto the nearest inside samples.
 *
 * dst1     : final destination, only touched by the vertical pass
 * dst2     : intermediate buffer receiving the horizontal pass
 * src      : source samples
 * stride   : distance in bytes between rows, used for src, dst2 and dst1
 * rounding : subtracted from the rounding bias of 16 before the shift
 *
 * Every result goes through CLIP(..., 0, 255) before it is stored.
 */
void interpolate16x16_lowpass_hv_c(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int32_t stride, int32_t rounding)
{
	const uint8_t bias = (uint8_t)(16 - rounding);
	const uint8_t *in = src;
	uint8_t *out = dst2;
	int32_t row;

	for (row = 0; row < 17; row++, in += stride, out += stride) {
		int32_t n;

		/* Outputs 0..2: taps folded at the left edge. */
		out[0] = CLIP(((14 * in[0] + 23 * in[1] - 7 * in[2] + 3 * in[3] - in[4] + bias) >> 5), 0, 255);
		out[1] = CLIP(((-3 * in[0] + 19 * in[1] + 20 * in[2] - 6 * in[3] + 3 * in[4] - in[5] + bias) >> 5), 0, 255);
		out[2] = CLIP(((2 * in[0] - 6 * in[1] + 20 * in[2] + 20 * in[3] - 6 * in[4] + 3 * in[5] - in[6] + bias) >> 5), 0, 255);

		/* Outputs 3..12: full symmetric window centred between n and n + 1. */
		for (n = 3; n <= 12; n++) {
			out[n] = CLIP(((20 * (in[n] + in[n + 1])
			                - 6 * (in[n - 1] + in[n + 2])
			                + 3 * (in[n - 2] + in[n + 3])
			                - (in[n - 3] + in[n + 4])
			                + bias) >> 5), 0, 255);
		}

		/* Outputs 13..15: taps folded at the right edge (mirror of 2..0). */
		out[13] = CLIP(((-in[10] + 3 * in[11] - 6 * in[12] + 20 * in[13] + 20 * in[14] - 6 * in[15] + 2 * in[16] + bias) >> 5), 0, 255);
		out[14] = CLIP(((-in[11] + 3 * in[12] - 6 * in[13] + 20 * in[14] + 19 * in[15] - 3 * in[16] + bias) >> 5), 0, 255);
		out[15] = CLIP(((-in[12] + 3 * in[13] - 7 * in[14] + 23 * in[15] + 14 * in[16] + bias) >> 5), 0, 255);
	}

	/* Vertical pass over the horizontally filtered rows. */
	interpolate16x16_lowpass_v_c(dst1, dst2, stride, rounding);
}
659 
/*
 * Horizontal-then-vertical lowpass filter, 8 outputs per row.
 *
 * Pass 1 (here): for 9 consecutive rows, reads src[0..8] of the row and
 * writes 8 horizontally filtered samples to the matching row of dst2.
 * Pass 2: hands dst2 to interpolate8x8_lowpass_v_c(), which writes the
 * final result to dst1.
 *
 * Interior outputs use the symmetric taps (-1, 3, -6, 20, 20, -6, 3, -1),
 * whose sum is 32 (hence the >> 5).  The three outputs next to each end of
 * the row use the coefficients obtained by folding the taps that would fall
 * outside src[0..8] back onto the nearest inside samples.
 *
 * dst1     : final destination, only touched by the vertical pass
 * dst2     : intermediate buffer receiving the horizontal pass
 * src      : source samples
 * stride   : distance in bytes between rows, used for src, dst2 and dst1
 * rounding : subtracted from the rounding bias of 16 before the shift
 *
 * Every result goes through CLIP(..., 0, 255) before it is stored.
 *
 * NOTE(review): this pass fills columns 0..7 of dst2 only, while
 * interpolate8x8_lowpass_v_c() walks 9 columns of its source; confirm that
 * callers do not depend on the 9th output column.
 */
void interpolate8x8_lowpass_hv_c(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int32_t stride, int32_t rounding)
{
	const uint8_t bias = (uint8_t)(16 - rounding);
	const uint8_t *in = src;
	uint8_t *out = dst2;
	int32_t row;

	for (row = 0; row < 9; row++, in += stride, out += stride) {
		int32_t n;

		/* Outputs 0..2: taps folded at the left edge. */
		out[0] = CLIP(((14 * in[0] + 23 * in[1] - 7 * in[2] + 3 * in[3] - in[4] + bias) >> 5), 0, 255);
		out[1] = CLIP(((-3 * in[0] + 19 * in[1] + 20 * in[2] - 6 * in[3] + 3 * in[4] - in[5] + bias) >> 5), 0, 255);
		out[2] = CLIP(((2 * in[0] - 6 * in[1] + 20 * in[2] + 20 * in[3] - 6 * in[4] + 3 * in[5] - in[6] + bias) >> 5), 0, 255);

		/* Outputs 3..4: full symmetric window centred between n and n + 1. */
		for (n = 3; n <= 4; n++) {
			out[n] = CLIP(((20 * (in[n] + in[n + 1])
			                - 6 * (in[n - 1] + in[n + 2])
			                + 3 * (in[n - 2] + in[n + 3])
			                - (in[n - 3] + in[n + 4])
			                + bias) >> 5), 0, 255);
		}

		/* Outputs 5..7: taps folded at the right edge (mirror of 2..0). */
		out[5] = CLIP(((-in[2] + 3 * in[3] - 6 * in[4] + 20 * in[5] + 20 * in[6] - 6 * in[7] + 2 * in[8] + bias) >> 5), 0, 255);
		out[6] = CLIP(((-in[3] + 3 * in[4] - 6 * in[5] + 20 * in[6] + 19 * in[7] - 3 * in[8] + bias) >> 5), 0, 255);
		out[7] = CLIP(((-in[4] + 3 * in[5] - 7 * in[6] + 23 * in[7] + 14 * in[8] + bias) >> 5), 0, 255);
	}

	/* Vertical pass over the horizontally filtered rows. */
	interpolate8x8_lowpass_v_c(dst1, dst2, stride, rounding);
}
685