1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "./aom_dsp_rtcd.h"
16 #include "./av1_rtcd.h"
17 #include "av1/common/blockd.h"
18 #include "av1/common/convolve.h"
19 #include "av1/common/filter.h"
20 #include "av1/common/onyxc_int.h"
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_ports/mem.h"
23
24 #define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
25 #define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
26 #define MAX_STEP (32)
27
// Horizontal-only sub-pixel convolution (C reference).
//
// For every one of the h rows, w output pixels are produced. The source
// position starts at subpel_x_q4 and moves by x_step_q4 per output pixel; its
// integer part (pos >> SUBPEL_BITS) picks the first source sample and its
// fractional part (pos & SUBPEL_MASK) picks the kernel out of filter_params.
// The filtered value is rounded by FILTER_BITS and clipped to a pixel. When
// conv_params->do_average is set the value is averaged (with rounding) into
// what dst already holds, otherwise it overwrites dst.
void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams filter_params,
                          const int subpel_x_q4, int x_step_q4,
                          ConvolveParams *conv_params) {
  const int taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // Shift left so that kernel tap 0 lines up with the first sample it reads.
  const uint8_t *row_src = src - (taps / 2 - 1);
  uint8_t *row_dst = dst;

  for (int row = 0; row < h;
       ++row, row_src += src_stride, row_dst += dst_stride) {
    int pos_q4 = subpel_x_q4;
    for (int col = 0; col < w; ++col, pos_q4 += x_step_q4) {
      const uint8_t *const tap_src = row_src + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
          filter_params, pos_q4 & SUBPEL_MASK);

      int acc = 0;
      for (int k = 0; k < taps; ++k) acc += tap_src[k] * kernel[k];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      row_dst[col] = conv_params->do_average
                         ? ROUND_POWER_OF_TWO(row_dst[col] + pixel, 1)
                         : pixel;
    }
  }
}
58
// Horizontal-only convolution with positions expressed in the "qn" scaled
// sub-pixel precision.
//
// Works like the q4 variant, except that the running position is split with
// SCALE_SUBPEL_BITS / SCALE_SUBPEL_MASK and the fractional part is reduced by
// SCALE_EXTRA_BITS before it is used as the kernel index. Results are rounded
// by FILTER_BITS, clipped to a pixel, and either stored or averaged into dst
// depending on conv_params->do_average.
void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_x_qn, int x_step_qn,
                              ConvolveParams *conv_params) {
  const int taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // Shift left so that kernel tap 0 lines up with the first sample it reads.
  const uint8_t *row_src = src - (taps / 2 - 1);
  uint8_t *row_dst = dst;

  for (int row = 0; row < h;
       ++row, row_src += src_stride, row_dst += dst_stride) {
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, pos_qn += x_step_qn) {
      const uint8_t *const tap_src = row_src + (pos_qn >> SCALE_SUBPEL_BITS);
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx);

      int acc = 0;
      for (int k = 0; k < taps; ++k) acc += tap_src[k] * kernel[k];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      row_dst[col] = conv_params->do_average
                         ? ROUND_POWER_OF_TWO(row_dst[col] + pixel, 1)
                         : pixel;
    }
  }
}
91
// Vertical-only sub-pixel convolution (C reference).
//
// Output is produced column by column. Within a column the source position
// starts at subpel_y_q4 and moves by y_step_q4 per output row; the integer
// part selects the first source row and the fractional part selects the
// kernel. Each result is rounded by FILTER_BITS, clipped to a pixel, and
// either stored or averaged into dst depending on conv_params->do_average.
void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams filter_params,
                         const int subpel_y_q4, int y_step_q4,
                         ConvolveParams *conv_params) {
  const int taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // Move up so that kernel tap 0 lines up with the first row it reads.
  const uint8_t *col_src = src - src_stride * (taps / 2 - 1);
  uint8_t *col_dst = dst;

  for (int col = 0; col < w; ++col, ++col_src, ++col_dst) {
    int pos_q4 = subpel_y_q4;
    for (int row = 0; row < h; ++row, pos_q4 += y_step_q4) {
      const uint8_t *const tap_src =
          &col_src[(pos_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
          filter_params, pos_q4 & SUBPEL_MASK);
      uint8_t *const out = &col_dst[row * dst_stride];

      int acc = 0;
      for (int k = 0; k < taps; ++k) acc += tap_src[k * src_stride] * kernel[k];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      *out = conv_params->do_average ? ROUND_POWER_OF_TWO(*out + pixel, 1)
                                     : pixel;
    }
  }
}
123
// Vertical-only convolution with positions expressed in the "qn" scaled
// sub-pixel precision.
//
// Output is produced column by column. The running position is split with
// SCALE_SUBPEL_BITS / SCALE_SUBPEL_MASK, and the fractional part is reduced
// by SCALE_EXTRA_BITS to obtain the kernel index. Each result is rounded by
// FILTER_BITS, clipped to a pixel, and either stored or averaged into dst
// depending on conv_params->do_average.
void av1_convolve_vert_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams filter_params,
                             const int subpel_y_qn, int y_step_qn,
                             ConvolveParams *conv_params) {
  const int taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // Move up so that kernel tap 0 lines up with the first row it reads.
  const uint8_t *col_src = src - src_stride * (taps / 2 - 1);
  uint8_t *col_dst = dst;

  for (int col = 0; col < w; ++col, ++col_src, ++col_dst) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      const uint8_t *const tap_src =
          &col_src[(pos_qn >> SCALE_SUBPEL_BITS) * src_stride];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx);
      uint8_t *const out = &col_dst[row * dst_stride];

      int acc = 0;
      for (int k = 0; k < taps; ++k) acc += tap_src[k * src_stride] * kernel[k];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      *out = conv_params->do_average ? ROUND_POWER_OF_TWO(*out + pixel, 1)
                                     : pixel;
    }
  }
}
158
// Copies a w x h block from src to dst without filtering.
//
// With conv_params->do_average == 0 each row is copied verbatim; otherwise
// each destination pixel becomes the rounded average of its current value
// and the corresponding source pixel.
static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (!conv_params->do_average) {
    // Plain copy, one row at a time.
    for (int row = 0; row < h; ++row, src += src_stride, dst += dst_stride)
      memcpy(dst, src, w);
    return;
  }

  // Averaging copy.
  for (int row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    for (int col = 0; col < w; ++col)
      dst[col] = clip_pixel(ROUND_POWER_OF_TWO(dst[col] + src[col], 1));
  }
}
181
// Dispatcher for horizontal convolution.
//
// Filters whose tap count equals SUBPEL_TAPS are routed to the
// aom_convolve8_horiz / aom_convolve8_avg_horiz routines (chosen by
// conv_params->do_average); every other tap count falls back to
// av1_convolve_horiz.
void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams filter_params,
                               const int subpel_x_q4, int x_step_q4,
                               ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
                       subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
  if (conv_params->do_average)
    aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, kernel,
                            x_step_q4, NULL, -1, w, h);
  else
    aom_convolve8_horiz(src, src_stride, dst, dst_stride, kernel, x_step_q4,
                        NULL, -1, w, h);
}
202
// Dispatcher for horizontal convolution that always uses the "_c" routines.
//
// Filters whose tap count equals SUBPEL_TAPS go to aom_convolve8_horiz_c /
// aom_convolve8_avg_horiz_c (chosen by conv_params->do_average); every other
// tap count falls back to av1_convolve_horiz_c.
void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams filter_params,
                                 const int subpel_x_q4, int x_step_q4,
                                 ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
  if (conv_params->do_average)
    aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, kernel,
                              x_step_q4, NULL, -1, w, h);
  else
    aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, kernel, x_step_q4,
                          NULL, -1, w, h);
}
223
// Dispatcher for horizontal convolution in the "qn" scaled precision.
//
// Filters whose tap count equals SUBPEL_TAPS go to aom_convolve8_horiz_scale /
// aom_convolve8_avg_horiz_scale (chosen by conv_params->do_average), with the
// kernel looked up at subpel_x_qn >> SCALE_EXTRA_BITS; every other tap count
// falls back to av1_convolve_horiz_scale.
void av1_convolve_horiz_facade_scale(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams filter_params,
                                     const int subpel_x_qn, int x_step_qn,
                                     ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h,
                             filter_params, subpel_x_qn, x_step_qn,
                             conv_params);
    return;
  }

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_x_qn >> SCALE_EXTRA_BITS);
  if (conv_params->do_average)
    aom_convolve8_avg_horiz_scale(src, src_stride, dst, dst_stride, kernel,
                                  subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
  else
    aom_convolve8_horiz_scale(src, src_stride, dst, dst_stride, kernel,
                              subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
}
245
// Dispatcher for vertical convolution.
//
// Filters whose tap count equals SUBPEL_TAPS are routed to the
// aom_convolve8_vert / aom_convolve8_avg_vert routines (chosen by
// conv_params->do_average); every other tap count falls back to
// av1_convolve_vert.
void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_y_q4, int y_step_q4,
                              ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
                      subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
  if (conv_params->do_average)
    aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1, kernel,
                           y_step_q4, w, h);
  else
    aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, kernel,
                       y_step_q4, w, h);
}
267
// Dispatcher for vertical convolution that always uses the "_c" routines.
//
// Filters whose tap count equals SUBPEL_TAPS go to aom_convolve8_vert_c /
// aom_convolve8_avg_vert_c (chosen by conv_params->do_average); every other
// tap count falls back to av1_convolve_vert_c.
void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams filter_params,
                                const int subpel_y_q4, int y_step_q4,
                                ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                        subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
  if (conv_params->do_average)
    aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1, kernel,
                             y_step_q4, w, h);
  else
    aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, kernel,
                         y_step_q4, w, h);
}
289
// Dispatcher for vertical convolution in the "qn" scaled precision.
//
// Filters whose tap count equals SUBPEL_TAPS go to aom_convolve8_vert_scale /
// aom_convolve8_avg_vert_scale (chosen by conv_params->do_average), with the
// kernel looked up at subpel_y_qn >> SCALE_EXTRA_BITS; every other tap count
// falls back to av1_convolve_vert_scale.
void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams filter_params,
                                    const int subpel_y_qn, int y_step_qn,
                                    ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h,
                            filter_params, subpel_y_qn, y_step_qn, conv_params);
    return;
  }

  const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_y_qn >> SCALE_EXTRA_BITS);
  if (conv_params->do_average)
    aom_convolve8_avg_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1,
                                 kernel, subpel_y_qn, y_step_qn, w, h);
  else
    aom_convolve8_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1,
                             kernel, subpel_y_qn, y_step_qn, w, h);
}
311
312 #if CONFIG_CONVOLVE_ROUND
// Converts a w x h block of 32-bit intermediate values to 8-bit pixels:
// every value is rounded down by `bits` and clipped to the pixel range.
void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h, int bits) {
  for (int row = 0; row < h; ++row) {
    const int in_off = row * src_stride;
    const int out_off = row * dst_stride;
    for (int col = 0; col < w; ++col) {
      dst[out_off + col] =
          clip_pixel(ROUND_POWER_OF_TWO(src[in_off + col], bits));
    }
  }
}
323
324 #if CONFIG_COMPOUND_ROUND
// Separable 2D convolution (CONFIG_COMPOUND_ROUND variant).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally, rounds by
// conv_params->round_0 and clips to 8 bits into an intermediate block whose
// stride is w. Pass 2 filters that block vertically, rounds by
// conv_params->round_1, and either stores the result in dst or adds it to
// dst when conv_params->do_average is set.
void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
                       int dst_stride, int w, int h,
                       InterpFilterParams *filter_params_x,
                       InterpFilterParams *filter_params_y,
                       const int subpel_x_q4, const int subpel_y_q4,
                       ConvolveParams *conv_params) {
  uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;

  // horizontal filter
  const uint8_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int k = 0; k < taps_x; ++k)
        acc += x_filter[k] * src_horiz[row * src_stride + col - fo_horiz + k];
      im_block[row * im_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(acc, conv_params->round_0));
    }
  }

  // vertical filter
  const uint8_t *const src_vert = im_block + fo_vert * im_stride;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 0;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_vert[(row - fo_vert + k) * im_stride + col];

      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
371
// Separable 2D convolution with scaled ("qn") positions
// (CONFIG_COMPOUND_ROUND variant).
//
// Pass 1 filters im_h source rows horizontally, stepping the source position
// by x_step_qn per output column, rounds by conv_params->round_0 and clips
// to 8 bits into an intermediate block whose stride is w. Pass 2 walks each
// column of that block, stepping by y_step_qn per output row, rounds by
// conv_params->round_1, and either stores the result in dst or adds it to
// dst when conv_params->do_average is set.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  // Rows needed: the last output row's integer source offset plus one full
  // kernel height.
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  for (int row = 0; row < im_h; ++row, src_horiz += src_stride) {
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, pos_qn += x_step_qn) {
      const uint8_t *const src_x = &src_horiz[(pos_qn >> SCALE_SUBPEL_BITS)];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);

      int acc = 0;
      for (int k = 0; k < taps_x; ++k) acc += x_filter[k] * src_x[k - fo_horiz];
      im_block[row * im_stride + col] =
          clip_pixel(ROUND_POWER_OF_TWO(acc, conv_params->round_0));
    }
  }

  // vertical filter
  const uint8_t *src_vert = im_block + fo_vert * im_stride;
  for (int col = 0; col < w; ++col, ++src_vert) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      const uint8_t *const src_y =
          &src_vert[(pos_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);

      CONV_BUF_TYPE acc = 0;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_y[(k - fo_vert) * im_stride];

      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
430
431 #else
432
433 /* When convolve-round is enabled and compound-round is disabled, we use a
434 high-precision convolve filter.
435 Note: For notes on hardware implementations, including the required
436 bit widths for various intermediate values, see the comments above
437 av1_warp_affine_c.
438 */
// Separable 2D convolution, high-precision variant (bit depth fixed at 8).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into a 32-bit
// intermediate block (stride w). An offset of 1 << (bd + FILTER_BITS - 1) is
// added before rounding by conv_params->round_0; the assert checks the
// biased sum is non-negative and below 1 << (bd + FILTER_BITS + 1).
// Pass 2 filters vertically starting from 1 << offset_bits, rounds by
// conv_params->round_1, subtracts the accumulated offsets, and either stores
// the result in dst or adds it to dst when conv_params->do_average is set.
void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
                       int dst_stride, int w, int h,
                       InterpFilterParams *filter_params_x,
                       InterpFilterParams *filter_params_y,
                       const int subpel_x_q4, const int subpel_y_q4,
                       ConvolveParams *conv_params) {
  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;

  // horizontal filter
  const uint8_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k)
        acc += x_filter[k] * src_horiz[row * src_stride + col - fo_horiz + k];
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // vertical filter
  const int32_t *const src_vert = im_block + fo_vert * im_stride;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_vert[(row - fo_vert + k) * im_stride + col];
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));

      // Undo the offsets that were added in both passes.
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
491
// Separable 2D convolution with scaled ("qn") positions, high-precision
// variant (bit depth fixed at 8).
//
// Pass 1 filters im_h source rows horizontally, stepping by x_step_qn per
// output column, into a 32-bit intermediate block (stride w); an offset of
// 1 << (bd + FILTER_BITS - 1) is added before rounding by
// conv_params->round_0. Pass 2 walks each column, stepping by y_step_qn per
// output row, starts from 1 << offset_bits, rounds by conv_params->round_1,
// subtracts the accumulated offsets, and either stores the result in dst or
// adds it to dst when conv_params->do_average is set.
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int bd = 8;
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  // Rows needed: the last output row's integer source offset plus one full
  // kernel height.
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;

  // horizontal filter
  const uint8_t *src_horiz = src - fo_vert * src_stride;
  for (int row = 0; row < im_h; ++row, src_horiz += src_stride) {
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, pos_qn += x_step_qn) {
      const uint8_t *const src_x = &src_horiz[(pos_qn >> SCALE_SUBPEL_BITS)];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);

      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) acc += x_filter[k] * src_x[k - fo_horiz];
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // vertical filter
  const int32_t *src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int col = 0; col < w; ++col, ++src_vert) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, pos_qn += y_step_qn) {
      const int32_t *const src_y =
          &src_vert[(pos_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);

      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));

      // Undo the offsets that were added in both passes.
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
556 #endif // CONFIG_COMPOUND_ROUND
557
// Entry point for the 2D convolution that writes into conv_params->dst.
//
// Filter parameters for both directions are obtained from interp_filters.
// When the vertical filter has at least as many taps as the horizontal one,
// av1_convolve_2d (or av1_convolve_2d_scale when `scaled` is non-zero) is
// called directly. Otherwise the source window and the current contents of
// conv_params->dst are transposed into local scratch arrays, the convolution
// is run with the x/y arguments swapped, and the result is transposed back.
// The dst / dst_stride arguments are not used here.
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            InterpFilters interp_filters, const int subpel_x_q4,
                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
                            int scaled, ConvolveParams *conv_params) {
  (void)dst;
  (void)dst_stride;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
                                 &filter_params_y);

  if (filter_params_y.taps >= filter_params_x.taps) {
    // Direct path: no transpose required.
    if (scaled) {
      av1_convolve_2d_scale(src, src_stride, conv_params->dst,
                            conv_params->dst_stride, w, h, &filter_params_x,
                            &filter_params_y, subpel_x_q4, x_step_q4,
                            subpel_y_q4, y_step_q4, conv_params);
    } else {
      av1_convolve_2d(src, src_stride, conv_params->dst,
                      conv_params->dst_stride, w, h, &filter_params_x,
                      &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
    }
    return;
  }

  // Transposed path.
  // NOTE(review): the source scratch area is sized for, and filled with, an
  // unscaled (w + taps_x - 1) x (h + taps_y - 1) window; whether that is
  // sufficient when `scaled` is set depends on the step sizes -- confirm
  // against the callers.
  uint8_t src_t[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
                (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
  CONV_BUF_TYPE dst_t[MAX_SB_SIZE * MAX_SB_SIZE];
  const int src_t_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
  const int dst_t_stride = MAX_SB_SIZE;
  const int fo_vert = filter_params_y.taps / 2 - 1;
  const int fo_horiz = filter_params_x.taps / 2 - 1;

  transpose_uint8(src_t, src_t_stride, src - fo_vert * src_stride - fo_horiz,
                  src_stride, w + filter_params_x.taps - 1,
                  h + filter_params_y.taps - 1);
  transpose_int32(dst_t, dst_t_stride, conv_params->dst,
                  conv_params->dst_stride, w, h);

  // horizontal and vertical parameters are swapped because of the transpose
  const uint8_t *const src_t_origin = src_t + fo_horiz * src_t_stride + fo_vert;
  if (scaled) {
    av1_convolve_2d_scale(src_t_origin, src_t_stride, dst_t, dst_t_stride, h, w,
                          &filter_params_y, &filter_params_x, subpel_y_q4,
                          y_step_q4, subpel_x_q4, x_step_q4, conv_params);
  } else {
    av1_convolve_2d(src_t_origin, src_t_stride, dst_t, dst_t_stride, h, w,
                    &filter_params_y, &filter_params_x, subpel_y_q4,
                    subpel_x_q4, conv_params);
  }

  transpose_int32(conv_params->dst, conv_params->dst_stride, dst_t,
                  dst_t_stride, h, w);
}
612
613 #if CONFIG_HIGHBITDEPTH
// Converts a w x h block of 32-bit intermediate values to high bit-depth
// pixels: every value is rounded down by `bits` and clipped for bit depth
// `bd`. dst8 is converted with CONVERT_TO_SHORTPTR and written as uint16_t.
void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    int bits, int bd) {
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);
  for (int row = 0; row < h; ++row) {
    const int in_off = row * src_stride;
    const int out_off = row * dst_stride;
    for (int col = 0; col < w; ++col) {
      dst16[out_off + col] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(src[in_off + col], bits), bd);
    }
  }
}
626
627 #if CONFIG_COMPOUND_ROUND
// High bit-depth separable 2D convolution (CONFIG_COMPOUND_ROUND variant).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally, rounds by
// conv_params->round_0 and clips for bit depth `bd` into a 16-bit
// intermediate block whose stride is w. Pass 2 filters that block
// vertically, rounds by conv_params->round_1, and either stores the result
// in dst or adds it to dst when conv_params->do_average is set.
void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params, int bd) {
  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;

  // horizontal filter
  const uint16_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int k = 0; k < taps_x; ++k)
        acc += x_filter[k] * src_horiz[row * src_stride + col - fo_horiz + k];
      im_block[row * im_stride + col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, conv_params->round_0), bd);
    }
  }

  // vertical filter
  const uint16_t *const src_vert = im_block + fo_vert * im_stride;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 0;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_vert[(row - fo_vert + k) * im_stride + col];

      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
674
// High bit-depth separable 2D convolution with scaled ("qn") positions
// (CONFIG_COMPOUND_ROUND variant).
//
// Pass 1 filters im_h source rows horizontally, stepping the source position
// by x_step_qn per output column, rounds by conv_params->round_0 and clips
// for bit depth `bd` into a 16-bit intermediate block whose stride is w.
// Pass 2 walks each column of that block, stepping by y_step_qn per output
// row, rounds by conv_params->round_1, and either stores the result in dst
// or adds it to dst when conv_params->do_average is set.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
                                    int h, InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int x, y, k;
  uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Rows needed: the last output row's integer source offset plus one full
  // kernel height.
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;

  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  for (y = 0; y < im_h; ++y) {
    int x_qn = subpel_x_qn;
    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
      int sum = 0;
      for (k = 0; k < filter_params_x->taps; ++k)
        sum += x_filter[k] * src_x[k - fo_horiz];
      // Clip to the actual bit depth, as the unscaled high bit-depth path
      // does; an 8-bit clip here would saturate every sample above 255.
      im_block[y * im_stride + x] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd);
    }
    src_horiz += src_stride;
  }

  // vertical filter
  uint16_t *src_vert = im_block + fo_vert * im_stride;
  for (x = 0; x < w; ++x) {
    int y_qn = subpel_y_qn;
    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
      const uint16_t *const src_y =
          &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
      CONV_BUF_TYPE sum = 0;
      for (k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      }
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average)
        dst[y * dst_stride + x] += res;
      else
        dst[y * dst_stride + x] = res;
    }
    src_vert++;
  }
}
734
735 #else
736
// High bit-depth separable 2D convolution, high-precision variant.
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into a 32-bit
// intermediate block (stride w). An offset of 1 << (bd + FILTER_BITS - 1) is
// added before rounding by conv_params->round_0; the assert checks the
// biased sum is non-negative and below 1 << (bd + FILTER_BITS + 1).
// Pass 2 filters vertically starting from 1 << offset_bits, rounds by
// conv_params->round_1, subtracts the accumulated offsets, and either stores
// the result in dst or adds it to dst when conv_params->do_average is set.
void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params, int bd) {
  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;

  // horizontal filter
  const uint16_t *const src_horiz = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k)
        acc += x_filter[k] * src_horiz[row * src_stride + col - fo_horiz + k];
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      im_block[row * im_stride + col] =
          ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // vertical filter
  const int32_t *const src_vert = im_block + fo_vert * im_stride;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k)
        acc += y_filter[k] * src_vert[(row - fo_vert + k) * im_stride + col];
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));

      // Undo the offsets that were added in both passes.
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average)
        *out += res;
      else
        *out = res;
    }
  }
}
789
// High bit-depth 2D separable convolution with scaling, C reference version.
//
// Positions advance by x_step_qn / y_step_qn per output sample. For a
// position p, the integer sample offset is p >> SCALE_SUBPEL_BITS and the
// kernel index is (p & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS.
//
// Pass 1 filters im_rows source rows horizontally (starting fo_vert rows
// above src) into a 32-bit intermediate block of stride w, rounding by
// conv_params->round_0. Pass 2 filters that block vertically, rounds by
// conv_params->round_1, subtracts a constant offset and then adds the result
// to dst (conv_params->do_average set) or stores it in dst.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
                                    int h, InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_stride = w;
  // Rows needed to reach the last output row's position, plus the kernel.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int round_0 = conv_params->round_0;
  const int round_1 = conv_params->round_1;

  // Pass 1: horizontal filter into the intermediate block.
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in_row = src + (row - fo_vert) * src_stride;
    int32_t *const out_row = im_block + row * im_stride;
    int x_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col, x_qn += x_step_qn) {
      // First sample touched by the kernel at this position.
      const uint16_t *const window =
          in_row + (x_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int kernel_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel_x =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * window[tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      out_row[col] = ROUND_POWER_OF_TWO(acc, round_0);
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  for (int col = 0; col < w; ++col) {
    int y_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row, y_qn += y_step_qn) {
      // Topmost intermediate sample touched by the kernel at this position.
      const int32_t *const column =
          im_block + (y_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int kernel_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel_y =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);
      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * column[tap * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(acc, round_1) -
                                ((1 << (offset_bits - round_1)) +
                                 (1 << (offset_bits - round_1 - 1)));
      CONV_BUF_TYPE *const out = &dst[row * dst_stride + col];
      if (conv_params->do_average) {
        *out += res;
      } else {
        *out = res;
      }
    }
  }
}
853 #endif // CONFIG_COMPOUND_ROUND
854
// Entry point for the high bit-depth 2D convolution. Looks up the x/y filter
// parameters for interp_filters and dispatches to av1_highbd_convolve_2d()
// or, when scaled is non-zero, av1_highbd_convolve_2d_scale(). The result is
// written to conv_params->dst; the dst / dst_stride arguments are unused.
//
// When the y filter has fewer taps than the x filter, the source window and
// the current contents of conv_params->dst are transposed into local
// buffers, the convolution is run with the x and y roles swapped, and the
// result is transposed back into conv_params->dst.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   InterpFilters interp_filters,
                                   const int subpel_x_q4, int x_step_q4,
                                   const int subpel_y_q4, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  (void)dst;
  (void)dst_stride;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
                                 &filter_params_y);

  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

  // Direct path: no transpose needed.
  if (filter_params_y.taps >= filter_params_x.taps) {
    if (scaled) {
      av1_highbd_convolve_2d_scale(
          src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
          &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
          subpel_y_q4, y_step_q4, conv_params, bd);
    } else {
      av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
                             conv_params->dst_stride, w, h, &filter_params_x,
                             &filter_params_y, subpel_x_q4, subpel_y_q4,
                             conv_params, bd);
    }
    return;
  }

  // Transposed path.
  uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
                  (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
  CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
  const int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
  const int tr_dst_stride = MAX_SB_SIZE;
  const int fo_vert = filter_params_y.taps / 2 - 1;
  const int fo_horiz = filter_params_x.taps / 2 - 1;

  // NOTE(review): only a (w + taps_x - 1) by (h + taps_y - 1) window is
  // transposed; confirm that this covers everything the scaled variant reads
  // when the step sizes exceed one sample.
  transpose_uint16(tr_src, tr_src_stride,
                   src - fo_vert * src_stride - fo_horiz, src_stride,
                   w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
  transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
                  conv_params->dst_stride, w, h);

  // Horizontal and vertical parameters are swapped because of the transpose.
  const uint16_t *const tr_origin = tr_src + fo_horiz * tr_src_stride + fo_vert;
  if (scaled) {
    av1_highbd_convolve_2d_scale(tr_origin, tr_src_stride, tr_dst,
                                 tr_dst_stride, h, w, &filter_params_y,
                                 &filter_params_x, subpel_y_q4, y_step_q4,
                                 subpel_x_q4, x_step_q4, conv_params, bd);
  } else {
    av1_highbd_convolve_2d(tr_origin, tr_src_stride, tr_dst, tr_dst_stride, h,
                           w, &filter_params_y, &filter_params_x, subpel_y_q4,
                           subpel_x_q4, conv_params, bd);
  }

  transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
                  tr_dst_stride, h, w);
}
913 #endif // CONFIG_HIGHBITDEPTH
914
915 #endif // CONFIG_CONVOLVE_ROUND
916
// Common signature of the 1-D convolve routines that convolve_helper() and
// convolve_scale_helper() receive as their convolve_horiz / convolve_vert
// callbacks.
typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams filter_params,
                             const int subpel_q4, int step_q4,
                             ConvolveParams *conv_params);
922
// Shared driver for the low bit-depth convolve entry points.
//
// A direction is skipped when its step equals SUBPEL_SHIFTS and its subpel
// offset is zero. With both directions skipped the block is handed to
// convolve_copy(); with one skipped only the other 1-D callback runs,
// straight from src to dst. Otherwise the two callbacks run back to back
// through a local intermediate buffer; the first pass never averages and the
// second pass uses the caller's conv_params.
static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilters interp_filters,
                            const int subpel_x_q4, int x_step_q4,
                            const int subpel_y_q4, int y_step_q4,
                            ConvolveParams *conv_params,
                            ConvolveFunc convolve_horiz,
                            ConvolveFunc convolve_vert) {
  const int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0;
  const int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_q4 <= MAX_STEP);
  assert(x_step_q4 <= MAX_STEP);

  if (ignore_horiz && ignore_vert) {
    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
    return;
  }
  if (ignore_vert) {
    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                   subpel_x_q4, x_step_q4, conv_params);
    return;
  }
  if (ignore_horiz) {
    assert(filter_params_y.taps <= MAX_FILTER_TAP);
    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                  subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  // Two-pass case.
  // temp's size is set to a 256 aligned value to facilitate SIMD
  // implementation. The value is greater than (maximum possible intermediate
  // height or width) * MAX_SB_SIZE
  DECLARE_ALIGNED(16, uint8_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
  (void)max_intermediate_size;

  // Parameters for the first pass: plain store, no averaging.
  ConvolveParams first_pass_params;
  first_pass_params.ref = 0;
  first_pass_params.do_average = 0;
  first_pass_params.round = CONVOLVE_OPT_ROUND;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  // we do filter with fewer taps first to reduce hardware implementation
  // complexity
  if (filter_params_y.taps < filter_params_x.taps) {
    const int taps_x = filter_params_x.taps;
    const int temp_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + taps_x;
    assert(intermediate_width <= max_intermediate_size);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    // Vertical pass over a widened window, then horizontal pass.
    convolve_vert(src - (taps_x / 2 - 1), src_stride, temp, temp_stride,
                  intermediate_width, h, filter_params_y, subpel_y_q4,
                  y_step_q4, &first_pass_params);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(temp + (taps_x / 2 - 1), temp_stride, dst, dst_stride, w, h,
                   filter_params_x, subpel_x_q4, x_step_q4, conv_params);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  {
    const int taps_y = filter_params_y.taps;
    const int temp_stride = MAX_SB_SIZE;
    const int intermediate_height =
        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + taps_y;
    assert(intermediate_height <= max_intermediate_size);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);

    // Horizontal pass over a taller window, then vertical pass.
    convolve_horiz(src - src_stride * (taps_y / 2 - 1), src_stride, temp,
                   temp_stride, w, intermediate_height, filter_params_x,
                   subpel_x_q4, x_step_q4, &first_pass_params);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    convolve_vert(temp + temp_stride * (taps_y / 2 - 1), temp_stride, dst,
                  dst_stride, w, h, filter_params_y, subpel_y_q4, y_step_q4,
                  conv_params);
  }
}
1019
// Shared driver for the low bit-depth scaled convolve entry point.
//
// A direction is skipped when its step equals SCALE_SUBPEL_SHIFTS and its
// subpel offset is zero. With both directions skipped the block is handed to
// convolve_copy(); with one skipped only the other 1-D callback runs,
// straight from src to dst. Otherwise the two callbacks run back to back
// through a local intermediate buffer; the first pass never averages and the
// second pass uses the caller's conv_params.
static void convolve_scale_helper(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilters interp_filters,
                                  const int subpel_x_qn, int x_step_qn,
                                  const int subpel_y_qn, int y_step_qn,
                                  ConvolveParams *conv_params,
                                  ConvolveFunc convolve_horiz,
                                  ConvolveFunc convolve_vert) {
  const int ignore_horiz =
      x_step_qn == SCALE_SUBPEL_SHIFTS && subpel_x_qn == 0;
  const int ignore_vert = y_step_qn == SCALE_SUBPEL_SHIFTS && subpel_y_qn == 0;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));

  if (ignore_horiz && ignore_vert) {
    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
    return;
  }
  if (ignore_vert) {
    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                   subpel_x_qn, x_step_qn, conv_params);
    return;
  }
  if (ignore_horiz) {
    assert(filter_params_y.taps <= MAX_FILTER_TAP);
    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                  subpel_y_qn, y_step_qn, conv_params);
    return;
  }

  // Two-pass case.
  // temp's size is set to a 256 aligned value to facilitate SIMD
  // implementation. The value is greater than (maximum possible intermediate
  // height or width) * MAX_SB_SIZE
  DECLARE_ALIGNED(16, uint8_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
  (void)max_intermediate_size;

  // Parameters for the first pass: plain store, no averaging.
  ConvolveParams first_pass_params;
  first_pass_params.ref = 0;
  first_pass_params.do_average = 0;
  first_pass_params.round = CONVOLVE_OPT_ROUND;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  // we do filter with fewer taps first to reduce hardware implementation
  // complexity
  if (filter_params_y.taps < filter_params_x.taps) {
    const int taps_x = filter_params_x.taps;
    const int temp_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + taps_x;
    assert(intermediate_width <= max_intermediate_size);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    // Vertical pass over a widened window, then horizontal pass.
    convolve_vert(src - (taps_x / 2 - 1), src_stride, temp, temp_stride,
                  intermediate_width, h, filter_params_y, subpel_y_qn,
                  y_step_qn, &first_pass_params);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(temp + (taps_x / 2 - 1), temp_stride, dst, dst_stride, w, h,
                   filter_params_x, subpel_x_qn, x_step_qn, conv_params);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  {
    const int taps_y = filter_params_y.taps;
    const int temp_stride = MAX_SB_SIZE;
    const int intermediate_height =
        (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
    assert(intermediate_height <= max_intermediate_size);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);

    // Horizontal pass over a taller window, then vertical pass.
    convolve_horiz(src - src_stride * (taps_y / 2 - 1), src_stride, temp,
                   temp_stride, w, intermediate_height, filter_params_x,
                   subpel_x_qn, x_step_qn, &first_pass_params);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    convolve_vert(temp + temp_stride * (taps_y / 2 - 1), temp_stride, dst,
                  dst_stride, w, h, filter_params_y, subpel_y_qn, y_step_qn,
                  conv_params);
  }
}
1119
// Low bit-depth convolve entry point: runs convolve_helper() with
// av1_convolve_horiz_facade / av1_convolve_vert_facade as the 1-D passes.
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
                  int dst_stride, int w, int h, InterpFilters interp_filters,
                  const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
                  int y_step_q4, ConvolveParams *conv_params) {
  const ConvolveFunc horiz_pass = av1_convolve_horiz_facade;
  const ConvolveFunc vert_pass = av1_convolve_vert_facade;

  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                  horiz_pass, vert_pass);
}
1128
// Same as av1_convolve(), but runs convolve_helper() with the
// av1_convolve_horiz_facade_c / av1_convolve_vert_facade_c 1-D passes.
void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
                    int dst_stride, int w, int h, InterpFilters interp_filters,
                    const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
                    int y_step_q4, ConvolveParams *conv_params) {
  const ConvolveFunc horiz_pass = av1_convolve_horiz_facade_c;
  const ConvolveFunc vert_pass = av1_convolve_vert_facade_c;

  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                  horiz_pass, vert_pass);
}
1137
// Low bit-depth scaled convolve entry point: runs convolve_scale_helper()
// with av1_convolve_horiz_facade_scale / av1_convolve_vert_facade_scale as
// the 1-D passes.
void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int w, int h,
                        InterpFilters interp_filters, const int subpel_x_qn,
                        int x_step_qn, const int subpel_y_qn, int y_step_qn,
                        ConvolveParams *conv_params) {
  const ConvolveFunc horiz_pass = av1_convolve_horiz_facade_scale;
  const ConvolveFunc vert_pass = av1_convolve_vert_facade_scale;

  convolve_scale_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                        subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn,
                        conv_params, horiz_pass, vert_pass);
}
1148
// C version of the low bit-depth convolve initialization. Intentionally
// empty: it is a placeholder for SIMD initialization.
void av1_lowbd_convolve_init_c(void) {}
1153
// C version of the high bit-depth convolve initialization. Intentionally
// empty: it is a placeholder for SIMD initialization.
void av1_highbd_convolve_init_c(void) {}
1158
// Runs the convolve initialization hook. With CONFIG_HIGHBITDEPTH enabled
// and cm->use_highbitdepth set, av1_highbd_convolve_init() is called; in
// every other case av1_lowbd_convolve_init() is called.
void av1_convolve_init(AV1_COMMON *cm) {
#if CONFIG_HIGHBITDEPTH
  if (cm->use_highbitdepth) {
    av1_highbd_convolve_init();
    return;
  }
#else
  (void)cm;
#endif
  av1_lowbd_convolve_init();
}
1171
1172 #if CONFIG_HIGHBITDEPTH
av1_highbd_convolve_horiz_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_q4,int x_step_q4,int avg,int bd)1173 void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
1174 uint16_t *dst, int dst_stride, int w, int h,
1175 const InterpFilterParams filter_params,
1176 const int subpel_x_q4, int x_step_q4, int avg,
1177 int bd) {
1178 int x, y;
1179 int filter_size = filter_params.taps;
1180 src -= filter_size / 2 - 1;
1181 for (y = 0; y < h; ++y) {
1182 int x_q4 = subpel_x_q4;
1183 for (x = 0; x < w; ++x) {
1184 const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1185 const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
1186 filter_params, x_q4 & SUBPEL_MASK);
1187 int k, sum = 0;
1188 for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
1189 if (avg)
1190 dst[x] = ROUND_POWER_OF_TWO(
1191 dst[x] +
1192 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1193 1);
1194 else
1195 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1196 x_q4 += x_step_q4;
1197 }
1198 src += src_stride;
1199 dst += dst_stride;
1200 }
1201 }
1202
av1_highbd_convolve_horiz_scale(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_qn,int x_step_qn,int avg,int bd)1203 void av1_highbd_convolve_horiz_scale(const uint16_t *src, int src_stride,
1204 uint16_t *dst, int dst_stride, int w,
1205 int h,
1206 const InterpFilterParams filter_params,
1207 const int subpel_x_qn, int x_step_qn,
1208 int avg, int bd) {
1209 int x, y;
1210 int filter_size = filter_params.taps;
1211 src -= filter_size / 2 - 1;
1212 for (y = 0; y < h; ++y) {
1213 int x_qn = subpel_x_qn;
1214 for (x = 0; x < w; ++x) {
1215 const uint16_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
1216 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1217 assert(x_filter_idx < SUBPEL_SHIFTS);
1218 const int16_t *x_filter =
1219 av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx);
1220 int k, sum = 0;
1221 for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
1222 if (avg)
1223 dst[x] = ROUND_POWER_OF_TWO(
1224 dst[x] +
1225 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1226 1);
1227 else
1228 dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1229 x_qn += x_step_qn;
1230 }
1231 src += src_stride;
1232 dst += dst_stride;
1233 }
1234 }
1235
av1_highbd_convolve_vert_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_q4,int y_step_q4,int avg,int bd)1236 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
1237 uint16_t *dst, int dst_stride, int w, int h,
1238 const InterpFilterParams filter_params,
1239 const int subpel_y_q4, int y_step_q4, int avg,
1240 int bd) {
1241 int x, y;
1242 int filter_size = filter_params.taps;
1243 src -= src_stride * (filter_size / 2 - 1);
1244
1245 for (x = 0; x < w; ++x) {
1246 int y_q4 = subpel_y_q4;
1247 for (y = 0; y < h; ++y) {
1248 const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1249 const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1250 filter_params, y_q4 & SUBPEL_MASK);
1251 int k, sum = 0;
1252 for (k = 0; k < filter_size; ++k)
1253 sum += src_y[k * src_stride] * y_filter[k];
1254 if (avg) {
1255 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
1256 dst[y * dst_stride] +
1257 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1258 1);
1259 } else {
1260 dst[y * dst_stride] =
1261 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1262 }
1263 y_q4 += y_step_q4;
1264 }
1265 ++src;
1266 ++dst;
1267 }
1268 }
1269
av1_highbd_convolve_vert_scale(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_qn,int y_step_qn,int avg,int bd)1270 void av1_highbd_convolve_vert_scale(const uint16_t *src, int src_stride,
1271 uint16_t *dst, int dst_stride, int w, int h,
1272 const InterpFilterParams filter_params,
1273 const int subpel_y_qn, int y_step_qn,
1274 int avg, int bd) {
1275 int x, y;
1276 int filter_size = filter_params.taps;
1277 src -= src_stride * (filter_size / 2 - 1);
1278
1279 for (x = 0; x < w; ++x) {
1280 int y_qn = subpel_y_qn;
1281 for (y = 0; y < h; ++y) {
1282 const uint16_t *const src_y =
1283 &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
1284 const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1285 assert(y_filter_idx < SUBPEL_SHIFTS);
1286 const int16_t *y_filter =
1287 av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
1288 int k, sum = 0;
1289 for (k = 0; k < filter_size; ++k)
1290 sum += src_y[k * src_stride] * y_filter[k];
1291 if (avg) {
1292 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
1293 dst[y * dst_stride] +
1294 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1295 1);
1296 } else {
1297 dst[y * dst_stride] =
1298 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1299 }
1300 y_qn += y_step_qn;
1301 }
1302 ++src;
1303 ++dst;
1304 }
1305 }
1306
// Copies a w x h block of 16-bit samples from src to dst. When avg is zero
// each row is copied with memcpy(); otherwise each destination sample is
// replaced by the rounded average of itself and the source sample, clipped
// with clip_pixel_highbd().
static void highbd_convolve_copy(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 int avg, int bd) {
  for (int row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    if (avg == 0) {
      memcpy(dst, src, w * sizeof(*src));
      continue;
    }
    for (int col = 0; col < w; ++col) {
      dst[col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[col] + src[col], 1), bd);
    }
  }
}
1328
av1_highbd_convolve_horiz_facade(const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_q4,int x_step_q4,int avg,int bd)1329 void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride,
1330 uint8_t *dst8, int dst_stride, int w,
1331 int h,
1332 const InterpFilterParams filter_params,
1333 const int subpel_x_q4, int x_step_q4,
1334 int avg, int bd) {
1335 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1336 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1337 if (filter_params.taps == SUBPEL_TAPS) {
1338 const int16_t *filter_x =
1339 av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
1340 if (avg == 0)
1341 aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x,
1342 x_step_q4, NULL, -1, w, h, bd);
1343 else
1344 aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride,
1345 filter_x, x_step_q4, NULL, -1, w, h, bd);
1346 } else {
1347 av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
1348 filter_params, subpel_x_q4, x_step_q4, avg, bd);
1349 }
1350 }
1351
// Scaled counterpart of av1_highbd_convolve_horiz_facade(): converts the
// 8-bit pointer form to 16-bit pointers and always forwards to
// av1_highbd_convolve_horiz_scale().
void av1_highbd_convolve_horiz_facade_scale(
    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
    int h, const InterpFilterParams filter_params, const int subpel_x_qn,
    int x_step_qn, int avg, int bd) {
  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
  // as in the function above.
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  av1_highbd_convolve_horiz_scale(src16, src_stride, dst16, dst_stride, w, h,
                                  filter_params, subpel_x_qn, x_step_qn, avg,
                                  bd);
}
1364
av1_highbd_convolve_vert_facade(const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_q4,int y_step_q4,int avg,int bd)1365 void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
1366 uint8_t *dst8, int dst_stride, int w,
1367 int h,
1368 const InterpFilterParams filter_params,
1369 const int subpel_y_q4, int y_step_q4,
1370 int avg, int bd) {
1371 uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1372 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1373
1374 if (filter_params.taps == SUBPEL_TAPS) {
1375 const int16_t *filter_y =
1376 av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
1377 if (avg == 0) {
1378 aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1,
1379 filter_y, y_step_q4, w, h, bd);
1380 } else {
1381 aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL,
1382 -1, filter_y, y_step_q4, w, h, bd);
1383 }
1384 } else {
1385 av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
1386 filter_params, subpel_y_q4, y_step_q4, avg, bd);
1387 }
1388 }
1389
// Scaled counterpart of av1_highbd_convolve_vert_facade(): converts the
// 8-bit pointer form to 16-bit pointers and always forwards to
// av1_highbd_convolve_vert_scale().
void av1_highbd_convolve_vert_facade_scale(
    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
    int h, const InterpFilterParams filter_params, const int subpel_y_qn,
    int y_step_qn, int avg, int bd) {
  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
  // as in the function above.
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  av1_highbd_convolve_vert_scale(src16, src_stride, dst16, dst_stride, w, h,
                                 filter_params, subpel_y_qn, y_step_qn, avg,
                                 bd);
}
1402
// High bit-depth convolve entry point (non-scaled step units).
//
// A direction is skipped when its step equals SUBPEL_SHIFTS and its subpel
// offset is zero. With both directions skipped the block is copied (or
// averaged into dst) by highbd_convolve_copy(); with one skipped only the
// other 1-D facade runs, straight from src8 to dst8. Otherwise the two
// facades run back to back through a local 16-bit intermediate buffer; the
// first pass is called with avg = 0 and the second with ref_idx.
//
// ref_idx is forwarded as the avg argument of the callees.
void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
                         int dst_stride, int w, int h,
                         InterpFilters interp_filters, const int subpel_x_q4,
                         int x_step_q4, const int subpel_y_q4, int y_step_q4,
                         int ref_idx, int bd) {
  const int ignore_horiz = x_step_q4 == SUBPEL_SHIFTS && subpel_x_q4 == 0;
  const int ignore_vert = y_step_q4 == SUBPEL_SHIFTS && subpel_y_q4 == 0;

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_q4 <= MAX_STEP);
  assert(x_step_q4 <= MAX_STEP);

  if (ignore_horiz && ignore_vert) {
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    highbd_convolve_copy(src16, src_stride, dst16, dst_stride, w, h, ref_idx,
                         bd);
    return;
  }

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  if (ignore_vert) {
    av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h,
                                     filter_params_x, subpel_x_q4, x_step_q4,
                                     ref_idx, bd);
    return;
  }
  if (ignore_horiz) {
    av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h,
                                    filter_params_y, subpel_y_q4, y_step_q4,
                                    ref_idx, bd);
    return;
  }

  // Two-pass case.
  // temp's size is set to a 256 aligned value to facilitate SIMD
  // implementation. The value is greater than (maximum possible intermediate
  // height or width) * MAX_SB_SIZE
  DECLARE_ALIGNED(16, uint16_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  uint8_t *const temp8 = CONVERT_TO_BYTEPTR(temp);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
  (void)max_intermediate_size;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  if (filter_params_y.taps < filter_params_x.taps) {
    const int taps_x = filter_params_x.taps;
    const int temp_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + taps_x;
    assert(intermediate_width <= max_intermediate_size);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    // Vertical pass over a widened window, then horizontal pass.
    av1_highbd_convolve_vert_facade(src8 - (taps_x / 2 - 1), src_stride, temp8,
                                    temp_stride, intermediate_width, h,
                                    filter_params_y, subpel_y_q4, y_step_q4, 0,
                                    bd);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_horiz_facade(temp8 + (taps_x / 2 - 1), temp_stride,
                                     dst8, dst_stride, w, h, filter_params_x,
                                     subpel_x_q4, x_step_q4, ref_idx, bd);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  {
    const int taps_y = filter_params_y.taps;
    const int temp_stride = MAX_SB_SIZE;
    const int intermediate_height =
        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + taps_y;
    assert(intermediate_height <= max_intermediate_size);

    // Horizontal pass over a taller window, then vertical pass.
    av1_highbd_convolve_horiz_facade(src8 - src_stride * (taps_y / 2 - 1),
                                     src_stride, temp8, temp_stride, w,
                                     intermediate_height, filter_params_x,
                                     subpel_x_q4, x_step_q4, 0, bd);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_vert_facade(temp8 + temp_stride * (taps_y / 2 - 1),
                                    temp_stride, dst8, dst_stride, w, h,
                                    filter_params_y, subpel_y_q4, y_step_q4,
                                    ref_idx, bd);
  }
}
1495
/*
 * High bit-depth convolution entry point taking "_qn" sub-pixel offsets and
 * step sizes.
 *
 * src8 / dst8 are byte-pointer handles to 16-bit sample buffers (they are
 * converted with CONVERT_TO_SHORTPTR before being copied from / to).
 *
 * Dispatch:
 *   - A direction is skipped when its step equals SCALE_SUBPEL_SHIFTS and its
 *     sub-pixel offset is zero.
 *   - Both directions skipped: plain copy through highbd_convolve_copy().
 *   - One direction skipped: a single 1-D pass straight from src8 to dst8.
 *   - Otherwise: two 1-D passes through an on-stack scratch buffer. The first
 *     pass is always called with 0 in place of ref_idx; only the pass that
 *     writes dst8 receives the caller's ref_idx.
 */
void av1_highbd_convolve_scale(const uint8_t *src8, int src_stride,
                               uint8_t *dst8, int dst_stride, int w, int h,
                               InterpFilters interp_filters,
                               const int subpel_x_qn, int x_step_qn,
                               const int subpel_y_qn, int y_step_qn,
                               int ref_idx, int bd) {
  const int skip_horiz =
      (subpel_x_qn == 0) && (x_step_qn == SCALE_SUBPEL_SHIFTS);
  const int skip_vert =
      (subpel_y_qn == 0) && (y_step_qn == SCALE_SUBPEL_SHIFTS);

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));

  // Nothing to filter in either direction: straight copy.
  if (skip_horiz && skip_vert) {
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    highbd_convolve_copy(src16, src_stride, dst16, dst_stride, w, h, ref_idx,
                         bd);
    return;
  }

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  // Horizontal-only filtering.
  if (skip_vert) {
    av1_highbd_convolve_horiz_facade_scale(src8, src_stride, dst8, dst_stride,
                                           w, h, filter_params_x, subpel_x_qn,
                                           x_step_qn, ref_idx, bd);
    return;
  }

  // Vertical-only filtering.
  if (skip_horiz) {
    av1_highbd_convolve_vert_facade_scale(src8, src_stride, dst8, dst_stride, w,
                                          h, filter_params_y, subpel_y_qn,
                                          y_step_qn, ref_idx, bd);
    return;
  }

  // Two-pass case. The scratch buffer's size is set to a 256 aligned value to
  // facilitate SIMD implementation. The value is greater than (maximum
  // possible intermediate height or width) * MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint16_t,
                  scratch[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  uint8_t *const scratch8 = CONVERT_TO_BYTEPTR(scratch);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  if (filter_params_y.taps < filter_params_x.taps) {
    // Vertical pass first (over a widened strip), then horizontal.
    const int x_taps = filter_params_x.taps;
    // Columns to the left of the block that the horizontal kernel reads.
    const int lead_cols = x_taps / 2 - 1;
    const int scratch_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + x_taps;

    assert(intermediate_width <= max_intermediate_size);
    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_vert_facade_scale(
        src8 - lead_cols, src_stride, scratch8, scratch_stride,
        intermediate_width, h, filter_params_y, subpel_y_qn, y_step_qn, 0, bd);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_horiz_facade_scale(
        scratch8 + lead_cols, scratch_stride, dst8, dst_stride, w, h,
        filter_params_x, subpel_x_qn, x_step_qn, ref_idx, bd);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  // Horizontal pass first (over a taller strip), then vertical.
  const int y_taps = filter_params_y.taps;
  // Rows above the block that the vertical kernel reads.
  const int lead_rows = y_taps / 2 - 1;
  const int scratch_stride = MAX_SB_SIZE;
  const int intermediate_height =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + y_taps;

  assert(intermediate_height <= max_intermediate_size);
  // Keeps the variable "used" when assert() compiles away.
  (void)max_intermediate_size;

  av1_highbd_convolve_horiz_facade_scale(
      src8 - src_stride * lead_rows, src_stride, scratch8, scratch_stride, w,
      intermediate_height, filter_params_x, subpel_x_qn, x_step_qn, 0, bd);

  assert(filter_params_y.taps <= MAX_FILTER_TAP);

  av1_highbd_convolve_vert_facade_scale(
      scratch8 + scratch_stride * lead_rows, scratch_stride, dst8, dst_stride,
      w, h, filter_params_y, subpel_y_qn, y_step_qn, ref_idx, bd);
}
1591 #endif // CONFIG_HIGHBITDEPTH
1592