1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "./aom_dsp_rtcd.h"
16 #include "./av1_rtcd.h"
17 #include "av1/common/blockd.h"
18 #include "av1/common/convolve.h"
19 #include "av1/common/filter.h"
20 #include "av1/common/onyxc_int.h"
21 #include "aom_dsp/aom_dsp_common.h"
22 #include "aom_ports/mem.h"
23 
// Size limits local to this file. MAX_BLOCK_WIDTH and MAX_BLOCK_HEIGHT are
// aliases of MAX_SB_SIZE. NOTE(review): none of the three macros is referenced
// in the portion of the file visible here; MAX_STEP is presumably an upper
// bound on the sub-pixel step -- confirm against its users.
24 #define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
25 #define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
26 #define MAX_STEP (32)
27 
/*
 * C reference horizontal sub-pixel filter for 8-bit pixels.
 *
 * Produces w output pixels for each of h rows. The fractional position starts
 * at subpel_x_q4 on every row and advances by x_step_q4 per output pixel; the
 * bits above SUBPEL_BITS select the source column and the SUBPEL_MASK bits
 * select the kernel returned by av1_get_interp_filter_subpel_kernel().
 *
 * Each filtered sum is rounded by FILTER_BITS and passed through clip_pixel().
 * If conv_params->do_average is set, the result is combined with the value
 * already stored in dst via ROUND_POWER_OF_TWO(old + new, 1); otherwise it
 * overwrites dst.
 */
void av1_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          const InterpFilterParams filter_params,
                          const int subpel_x_q4, int x_step_q4,
                          ConvolveParams *conv_params) {
  const int num_taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // The kernel window starts (num_taps / 2 - 1) pixels before the position
  // being interpolated.
  const uint8_t *src_row = src - (num_taps / 2 - 1);
  uint8_t *dst_row = dst;

  for (int row = 0; row < h; ++row) {
    int pos_q4 = subpel_x_q4;
    for (int col = 0; col < w; ++col) {
      const uint8_t *const window = src_row + (pos_q4 >> SUBPEL_BITS);
      const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
          filter_params, pos_q4 & SUBPEL_MASK);

      int acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) acc += window[tap] * kernel[tap];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      dst_row[col] = conv_params->do_average
                         ? ROUND_POWER_OF_TWO(dst_row[col] + pixel, 1)
                         : pixel;

      pos_q4 += x_step_q4;
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
58 
/*
 * Horizontal sub-pixel filter for 8-bit pixels using the higher-precision
 * "qn" position format.
 *
 * Same structure as the q4 variant, except that the position is split with
 * SCALE_SUBPEL_BITS / SCALE_SUBPEL_MASK and the fractional part is reduced by
 * SCALE_EXTRA_BITS to obtain the kernel index (asserted to be below
 * SUBPEL_SHIFTS).
 *
 * Each filtered sum is rounded by FILTER_BITS and passed through clip_pixel();
 * conv_params->do_average selects between averaging with the existing dst
 * value and overwriting it.
 */
void av1_convolve_horiz_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_x_qn, int x_step_qn,
                              ConvolveParams *conv_params) {
  const int num_taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // The kernel window starts (num_taps / 2 - 1) pixels before the position
  // being interpolated.
  const uint8_t *src_row = src - (num_taps / 2 - 1);
  uint8_t *dst_row = dst;

  for (int row = 0; row < h; ++row) {
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint8_t *const window = src_row + (pos_qn >> SCALE_SUBPEL_BITS);
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx);

      int acc = 0;
      for (int tap = 0; tap < num_taps; ++tap) acc += window[tap] * kernel[tap];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      dst_row[col] = conv_params->do_average
                         ? ROUND_POWER_OF_TWO(dst_row[col] + pixel, 1)
                         : pixel;

      pos_qn += x_step_qn;
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
91 
/*
 * C reference vertical sub-pixel filter for 8-bit pixels.
 *
 * Works column by column: for each of the w columns, h output pixels are
 * produced. The fractional position starts at subpel_y_q4 at the top of every
 * column and advances by y_step_q4 per output row; the bits above SUBPEL_BITS
 * select the source row and the SUBPEL_MASK bits select the kernel.
 *
 * Each filtered sum is rounded by FILTER_BITS and passed through clip_pixel();
 * conv_params->do_average selects between averaging with the existing dst
 * value and overwriting it.
 */
void av1_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int w, int h,
                         const InterpFilterParams filter_params,
                         const int subpel_y_q4, int y_step_q4,
                         ConvolveParams *conv_params) {
  const int num_taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // The kernel window starts (num_taps / 2 - 1) rows above the position being
  // interpolated.
  const uint8_t *src_col = src - src_stride * (num_taps / 2 - 1);
  uint8_t *dst_col = dst;

  for (int col = 0; col < w; ++col) {
    int pos_q4 = subpel_y_q4;
    for (int row = 0; row < h; ++row) {
      const uint8_t *const window =
          src_col + (pos_q4 >> SUBPEL_BITS) * src_stride;
      const int16_t *const kernel = av1_get_interp_filter_subpel_kernel(
          filter_params, pos_q4 & SUBPEL_MASK);

      int acc = 0;
      for (int tap = 0; tap < num_taps; ++tap)
        acc += window[tap * src_stride] * kernel[tap];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      uint8_t *const out = dst_col + row * dst_stride;
      *out = conv_params->do_average ? ROUND_POWER_OF_TWO(*out + pixel, 1)
                                     : pixel;

      pos_q4 += y_step_q4;
    }
    ++src_col;
    ++dst_col;
  }
}
123 
/*
 * Vertical sub-pixel filter for 8-bit pixels using the higher-precision "qn"
 * position format.
 *
 * Works column by column. The position is split with SCALE_SUBPEL_BITS /
 * SCALE_SUBPEL_MASK and the fractional part is reduced by SCALE_EXTRA_BITS to
 * obtain the kernel index (asserted to be below SUBPEL_SHIFTS).
 *
 * Each filtered sum is rounded by FILTER_BITS and passed through clip_pixel();
 * conv_params->do_average selects between averaging with the existing dst
 * value and overwriting it.
 */
void av1_convolve_vert_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams filter_params,
                             const int subpel_y_qn, int y_step_qn,
                             ConvolveParams *conv_params) {
  const int num_taps = filter_params.taps;
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  // The kernel window starts (num_taps / 2 - 1) rows above the position being
  // interpolated.
  const uint8_t *src_col = src - src_stride * (num_taps / 2 - 1);
  uint8_t *dst_col = dst;

  for (int col = 0; col < w; ++col) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row) {
      const uint8_t *const window =
          src_col + (pos_qn >> SCALE_SUBPEL_BITS) * src_stride;
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(filter_params, kernel_idx);

      int acc = 0;
      for (int tap = 0; tap < num_taps; ++tap)
        acc += window[tap * src_stride] * kernel[tap];

      const int pixel = clip_pixel(ROUND_POWER_OF_TWO(acc, FILTER_BITS));
      uint8_t *const out = dst_col + row * dst_stride;
      *out = conv_params->do_average ? ROUND_POWER_OF_TWO(*out + pixel, 1)
                                     : pixel;

      pos_qn += y_step_qn;
    }
    ++src_col;
    ++dst_col;
  }
}
158 
/*
 * Copies a w x h block of 8-bit pixels from src to dst without filtering.
 *
 * With conv_params->do_average equal to 0 each row is copied with memcpy().
 * Otherwise every destination pixel is replaced by
 * clip_pixel(ROUND_POWER_OF_TWO(dst + src, 1)).
 */
static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
                          int dst_stride, int w, int h,
                          ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);
  const int averaging = (conv_params->do_average != 0);

  for (int row = 0; row < h; ++row) {
    if (averaging) {
      for (int col = 0; col < w; ++col) {
        const int blended = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
        dst[col] = clip_pixel(blended);
      }
    } else {
      memcpy(dst, src, w);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
181 
/*
 * Dispatches a horizontal convolution.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_horiz() or, when conv_params->do_average is non-zero, to
 * aom_convolve8_avg_horiz(), using the kernel looked up for subpel_x_q4.
 * Every other tap count goes to av1_convolve_horiz().
 */
void av1_convolve_horiz_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                               int dst_stride, int w, int h,
                               const InterpFilterParams filter_params,
                               const int subpel_x_q4, int x_step_q4,
                               ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
                       subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel_x =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
  if (conv_params->do_average) {
    aom_convolve8_avg_horiz(src, src_stride, dst, dst_stride, kernel_x,
                            x_step_q4, NULL, -1, w, h);
  } else {
    aom_convolve8_horiz(src, src_stride, dst, dst_stride, kernel_x, x_step_q4,
                        NULL, -1, w, h);
  }
}
202 
/*
 * Dispatches a horizontal convolution to the plain C implementations.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_horiz_c() or, when conv_params->do_average is non-zero, to
 * aom_convolve8_avg_horiz_c(), using the kernel looked up for subpel_x_q4.
 * Every other tap count goes to av1_convolve_horiz_c().
 */
void av1_convolve_horiz_facade_c(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams filter_params,
                                 const int subpel_x_q4, int x_step_q4,
                                 ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                         subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel_x =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
  if (conv_params->do_average) {
    aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, kernel_x,
                              x_step_q4, NULL, -1, w, h);
  } else {
    aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, kernel_x,
                          x_step_q4, NULL, -1, w, h);
  }
}
223 
/*
 * Dispatches a horizontal convolution for the scaled ("qn" position) case.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_horiz_scale() or, when conv_params->do_average is non-zero,
 * to aom_convolve8_avg_horiz_scale(). The kernel handed to them is looked up
 * with subpel_x_qn >> SCALE_EXTRA_BITS. Every other tap count goes to
 * av1_convolve_horiz_scale().
 */
void av1_convolve_horiz_facade_scale(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams filter_params,
                                     const int subpel_x_qn, int x_step_qn,
                                     ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_horiz_scale(src, src_stride, dst, dst_stride, w, h,
                             filter_params, subpel_x_qn, x_step_qn,
                             conv_params);
    return;
  }

  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_x_qn >> SCALE_EXTRA_BITS);
  if (conv_params->do_average) {
    aom_convolve8_avg_horiz_scale(src, src_stride, dst, dst_stride, kernel_x,
                                  subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
  } else {
    aom_convolve8_horiz_scale(src, src_stride, dst, dst_stride, kernel_x,
                              subpel_x_qn, x_step_qn, NULL, 0, -1, w, h);
  }
}
245 
/*
 * Dispatches a vertical convolution.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_vert() or, when conv_params->do_average is non-zero, to
 * aom_convolve8_avg_vert(), using the kernel looked up for subpel_y_q4.
 * Every other tap count goes to av1_convolve_vert().
 */
void av1_convolve_vert_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
                              const InterpFilterParams filter_params,
                              const int subpel_y_q4, int y_step_q4,
                              ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
                      subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel_y =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
  if (conv_params->do_average) {
    aom_convolve8_avg_vert(src, src_stride, dst, dst_stride, NULL, -1, kernel_y,
                           y_step_q4, w, h);
  } else {
    aom_convolve8_vert(src, src_stride, dst, dst_stride, NULL, -1, kernel_y,
                       y_step_q4, w, h);
  }
}
267 
/*
 * Dispatches a vertical convolution to the plain C implementations.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_vert_c() or, when conv_params->do_average is non-zero, to
 * aom_convolve8_avg_vert_c(), using the kernel looked up for subpel_y_q4.
 * Every other tap count goes to av1_convolve_vert_c().
 */
void av1_convolve_vert_facade_c(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h,
                                const InterpFilterParams filter_params,
                                const int subpel_y_q4, int y_step_q4,
                                ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
                        subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  const int16_t *const kernel_y =
      av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
  if (conv_params->do_average) {
    aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, NULL, -1,
                             kernel_y, y_step_q4, w, h);
  } else {
    aom_convolve8_vert_c(src, src_stride, dst, dst_stride, NULL, -1, kernel_y,
                         y_step_q4, w, h);
  }
}
289 
/*
 * Dispatches a vertical convolution for the scaled ("qn" position) case.
 *
 * Filters whose tap count equals SUBPEL_TAPS are forwarded to
 * aom_convolve8_vert_scale() or, when conv_params->do_average is non-zero, to
 * aom_convolve8_avg_vert_scale(). The kernel handed to them is looked up with
 * subpel_y_qn >> SCALE_EXTRA_BITS. Every other tap count goes to
 * av1_convolve_vert_scale().
 */
void av1_convolve_vert_facade_scale(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride, int w, int h,
                                    const InterpFilterParams filter_params,
                                    const int subpel_y_qn, int y_step_qn,
                                    ConvolveParams *conv_params) {
  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  if (filter_params.taps != SUBPEL_TAPS) {
    av1_convolve_vert_scale(src, src_stride, dst, dst_stride, w, h,
                            filter_params, subpel_y_qn, y_step_qn, conv_params);
    return;
  }

  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      filter_params, subpel_y_qn >> SCALE_EXTRA_BITS);
  if (conv_params->do_average) {
    aom_convolve8_avg_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1,
                                 kernel_y, subpel_y_qn, y_step_qn, w, h);
  } else {
    aom_convolve8_vert_scale(src, src_stride, dst, dst_stride, NULL, 0, -1,
                             kernel_y, subpel_y_qn, y_step_qn, w, h);
  }
}
311 
312 #if CONFIG_CONVOLVE_ROUND
/*
 * Converts a w x h block of 32-bit intermediate values to 8-bit pixels.
 *
 * Every source value is rounded with ROUND_POWER_OF_TWO(value, bits), passed
 * through clip_pixel() and stored at the matching position in dst.
 */
void av1_convolve_rounding_c(const int32_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h, int bits) {
  for (int row = 0; row < h; ++row) {
    const int32_t *const in_row = src + row * src_stride;
    uint8_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(in_row[col], bits));
    }
  }
}
323 
324 #if CONFIG_COMPOUND_ROUND
/*
 * Separable 2-D sub-pixel filter for 8-bit pixels (CONFIG_COMPOUND_ROUND
 * variant).
 *
 * Pass 1 filters horizontally into an 8-bit intermediate block of
 * (h + taps_y - 1) rows, rounding by conv_params->round_0 and passing the
 * result through clip_pixel(). Pass 2 filters that block vertically, rounds
 * by conv_params->round_1 and either stores the result in dst or, when
 * conv_params->do_average is set, adds it to the value already in dst.
 */
void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
                       int dst_stride, int w, int h,
                       InterpFilterParams *filter_params_x,
                       InterpFilterParams *filter_params_y,
                       const int subpel_x_q4, const int subpel_y_q4,
                       ConvolveParams *conv_params) {
  uint8_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  // Number of taps that lie before the centre position in each direction.
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;

  // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row
  // -fo_vert so the vertical pass has the rows it needs above the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint8_t *const in_row = src + (row - fo_vert) * src_stride - fo_horiz;
    uint8_t *const out_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in_row[col + tap];
      }
      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, conv_params->round_0));
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 0;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      if (conv_params->do_average) {
        out_row[col] += res;
      } else {
        out_row[col] = res;
      }
    }
  }
}
371 
/*
 * Separable 2-D sub-pixel filter with scaling for 8-bit pixels
 * (CONFIG_COMPOUND_ROUND variant).
 *
 * Positions use the "qn" format: the bits above SCALE_SUBPEL_BITS give the
 * integer sample and the SCALE_SUBPEL_MASK bits, reduced by SCALE_EXTRA_BITS,
 * give the kernel index. Pass 1 filters horizontally into an 8-bit
 * intermediate block (rounded by conv_params->round_0 and passed through
 * clip_pixel()); pass 2 filters that block vertically, rounds by
 * conv_params->round_1 and stores or, with conv_params->do_average set,
 * accumulates into dst.
 */
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  uint8_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  // Integer row reached by the last output row, plus the kernel height.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;

  // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row
  // -fo_vert.
  for (int row = 0; row < im_rows; ++row) {
    const uint8_t *const in_row = src + (row - fo_vert) * src_stride;
    uint8_t *const out_row = im_block + row * im_stride;
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint8_t *const window =
          in_row + (pos_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);

      int acc = 0;
      for (int tap = 0; tap < taps_x; ++tap) acc += kernel[tap] * window[tap];

      out_row[col] = clip_pixel(ROUND_POWER_OF_TWO(acc, conv_params->round_0));
      pos_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  for (int col = 0; col < w; ++col) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row) {
      const uint8_t *const window =
          im_block + (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);

      CONV_BUF_TYPE acc = 0;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * window[tap * im_stride];
      }

      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      CONV_BUF_TYPE *const out = dst + row * dst_stride + col;
      if (conv_params->do_average) {
        *out += res;
      } else {
        *out = res;
      }
      pos_qn += y_step_qn;
    }
  }
}
430 
431 #else
432 
433 /* When convolve-round is enabled and compound-round is disabled, we use a
434    high-precision convolve filter.
435    Note: For notes on hardware implementations, including the required
436    bit widths for various intermediate values, see the comments above
437    av1_warp_affine_c.
438 */
/*
 * Separable 2-D sub-pixel filter for 8-bit pixels, high-precision variant
 * (CONFIG_COMPOUND_ROUND disabled).
 *
 * The intermediate block holds 32-bit values and is not clipped. Each pass
 * starts its accumulator from a power-of-two offset; the asserts check that
 * the resulting sums stay non-negative and within a fixed bit width. After
 * the vertical pass a constant is subtracted from the rounded value --
 * presumably the combined contribution of the two offsets, assuming kernels
 * that sum to 1 << FILTER_BITS (TODO confirm). The result is stored in dst
 * or, with conv_params->do_average set, added to it.
 */
void av1_convolve_2d_c(const uint8_t *src, int src_stride, CONV_BUF_TYPE *dst,
                       int dst_stride, int w, int h,
                       InterpFilterParams *filter_params_x,
                       InterpFilterParams *filter_params_y,
                       const int subpel_x_q4, const int subpel_y_q4,
                       ConvolveParams *conv_params) {
  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;

  // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row
  // -fo_vert.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint8_t *const in_row = src + (row - fo_vert) * src_stride - fo_horiz;
    int32_t *const out_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in_row[col + tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));
      out_row[col] = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));

      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      if (conv_params->do_average) {
        out_row[col] += res;
      } else {
        out_row[col] = res;
      }
    }
  }
}
491 
/*
 * Separable 2-D sub-pixel filter with scaling for 8-bit pixels,
 * high-precision variant (CONFIG_COMPOUND_ROUND disabled).
 *
 * Positions use the "qn" format: the bits above SCALE_SUBPEL_BITS give the
 * integer sample and the SCALE_SUBPEL_MASK bits, reduced by SCALE_EXTRA_BITS,
 * give the kernel index. The intermediate block holds unclipped 32-bit
 * values. Each pass starts its accumulator from a power-of-two offset (the
 * asserts check the sums stay non-negative and bounded), and a constant is
 * subtracted after the final rounding -- presumably the combined contribution
 * of those offsets (TODO confirm). The result is stored in dst or, with
 * conv_params->do_average set, added to it.
 */
void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride,
                             CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y,
                             const int subpel_x_qn, const int x_step_qn,
                             const int subpel_y_qn, const int y_step_qn,
                             ConvolveParams *conv_params) {
  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  // Integer row reached by the last output row, plus the kernel height.
  const int im_rows =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int bd = 8;

  // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row
  // -fo_vert.
  for (int row = 0; row < im_rows; ++row) {
    const uint8_t *const in_row = src + (row - fo_vert) * src_stride;
    int32_t *const out_row = im_block + row * im_stride;
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint8_t *const window =
          in_row + (pos_qn >> SCALE_SUBPEL_BITS) - fo_horiz;
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, kernel_idx);

      int32_t acc = (1 << (bd + FILTER_BITS - 1));
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel[tap] * window[tap];
      }
      assert(0 <= acc && acc < (1 << (bd + FILTER_BITS + 1)));

      out_row[col] = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
      pos_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  for (int col = 0; col < w; ++col) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row) {
      const int32_t *const window =
          im_block + (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int kernel_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(kernel_idx < SUBPEL_SHIFTS);
      const int16_t *const kernel =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, kernel_idx);

      CONV_BUF_TYPE acc = 1 << offset_bits;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel[tap] * window[tap * im_stride];
      }
      assert(0 <= acc && acc < (1 << (offset_bits + 2)));

      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1) -
          ((1 << (offset_bits - conv_params->round_1)) +
           (1 << (offset_bits - conv_params->round_1 - 1)));
      CONV_BUF_TYPE *const out = dst + row * dst_stride + col;
      if (conv_params->do_average) {
        *out += res;
      } else {
        *out = res;
      }
      pos_qn += y_step_qn;
    }
  }
}
556 #endif  // CONFIG_COMPOUND_ROUND
557 
/*
 * Entry point for the 2-D convolution that writes into conv_params->dst.
 *
 * The filter parameters for both directions are obtained from interp_filters
 * via av1_get_convolve_filter_params(). `scaled` selects between
 * av1_convolve_2d_scale() and av1_convolve_2d(). The dst / dst_stride
 * arguments are not used; output goes to conv_params->dst with stride
 * conv_params->dst_stride.
 *
 * When the vertical filter has fewer taps than the horizontal one, the source
 * window and the current destination contents are transposed into local
 * buffers, the convolution runs with the two directions exchanged, and the
 * result is transposed back.
 */
void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            InterpFilters interp_filters, const int subpel_x_q4,
                            int x_step_q4, const int subpel_y_q4, int y_step_q4,
                            int scaled, ConvolveParams *conv_params) {
  (void)x_step_q4;
  (void)y_step_q4;
  (void)dst;
  (void)dst_stride;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
                                 &filter_params_y);

  // Common case: no transposition required.
  if (filter_params_y.taps >= filter_params_x.taps) {
    if (scaled) {
      av1_convolve_2d_scale(src, src_stride, conv_params->dst,
                            conv_params->dst_stride, w, h, &filter_params_x,
                            &filter_params_y, subpel_x_q4, x_step_q4,
                            subpel_y_q4, y_step_q4, conv_params);
    } else {
      av1_convolve_2d(src, src_stride, conv_params->dst,
                      conv_params->dst_stride, w, h, &filter_params_x,
                      &filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
    }
    return;
  }

  // Transposed path.
  uint8_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
                 (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
  const int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
  CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
  const int tr_dst_stride = MAX_SB_SIZE;
  const int fo_vert = filter_params_y.taps / 2 - 1;
  const int fo_horiz = filter_params_x.taps / 2 - 1;

  // Copy the block plus its filter border, and the existing destination
  // contents, into the transposed buffers.
  // NOTE(review): when `scaled` is set the callee may read source samples
  // outside the (w + taps_x - 1) x (h + taps_y - 1) window copied here,
  // depending on the step sizes -- confirm this path is safe for scaling.
  const uint8_t *const src_window = src - fo_vert * src_stride - fo_horiz;
  transpose_uint8(tr_src, tr_src_stride, src_window, src_stride,
                  w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
  transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
                  conv_params->dst_stride, w, h);

  // horizontal and vertical parameters are swapped because of the transpose
  const uint8_t *const tr_origin = tr_src + fo_horiz * tr_src_stride + fo_vert;
  if (scaled) {
    av1_convolve_2d_scale(tr_origin, tr_src_stride, tr_dst, tr_dst_stride, h, w,
                          &filter_params_y, &filter_params_x, subpel_y_q4,
                          y_step_q4, subpel_x_q4, x_step_q4, conv_params);
  } else {
    av1_convolve_2d(tr_origin, tr_src_stride, tr_dst, tr_dst_stride, h, w,
                    &filter_params_y, &filter_params_x, subpel_y_q4,
                    subpel_x_q4, conv_params);
  }

  transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
                  tr_dst_stride, h, w);
}
612 
613 #if CONFIG_HIGHBITDEPTH
/*
 * Converts a w x h block of 32-bit intermediate values to high-bit-depth
 * pixels.
 *
 * dst8 is turned into a uint16_t pointer with CONVERT_TO_SHORTPTR(). Every
 * source value is rounded with ROUND_POWER_OF_TWO(value, bits), passed
 * through clip_pixel_highbd(..., bd) and stored at the matching position.
 */
void av1_highbd_convolve_rounding_c(const int32_t *src, int src_stride,
                                    uint8_t *dst8, int dst_stride, int w, int h,
                                    int bits, int bd) {
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
  for (int row = 0; row < h; ++row) {
    const int32_t *const in_row = src + row * src_stride;
    uint16_t *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      out_row[col] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(in_row[col], bits), bd);
    }
  }
}
626 
627 #if CONFIG_COMPOUND_ROUND
/*
 * Separable 2-D sub-pixel filter for 16-bit (high-bit-depth) pixels
 * (CONFIG_COMPOUND_ROUND variant).
 *
 * Pass 1 filters horizontally into a 16-bit intermediate block of
 * (h + taps_y - 1) rows, rounding by conv_params->round_0 and passing the
 * result through clip_pixel_highbd(..., bd). Pass 2 filters that block
 * vertically, rounds by conv_params->round_1 and either stores the result in
 * dst or, when conv_params->do_average is set, adds it to the value already
 * in dst.
 */
void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params, int bd) {
  uint16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int im_rows = h + taps_y - 1;
  const int im_stride = w;
  // Number of taps that lie before the centre position in each direction.
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;

  // Pass 1: horizontal filter. Intermediate row 0 corresponds to source row
  // -fo_vert so the vertical pass has the rows it needs above the block.
  const int16_t *const kernel_x = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_rows; ++row) {
    const uint16_t *const in_row =
        src + (row - fo_vert) * src_stride - fo_horiz;
    uint16_t *const out_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t acc = 0;
      for (int tap = 0; tap < taps_x; ++tap) {
        acc += kernel_x[tap] * in_row[col + tap];
      }
      out_row[col] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(acc, conv_params->round_0), bd);
    }
  }

  // Pass 2: vertical filter over the intermediate block.
  const int16_t *const kernel_y = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE acc = 0;
      for (int tap = 0; tap < taps_y; ++tap) {
        acc += kernel_y[tap] * im_block[(row + tap) * im_stride + col];
      }
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(acc, conv_params->round_1);
      if (conv_params->do_average) {
        out_row[col] += res;
      } else {
        out_row[col] = res;
      }
    }
  }
}
674 
// Scaled, separable 2-D high-bitdepth convolution (CONFIG_COMPOUND_ROUND
// build).
//
//   src, src_stride          : high-bitdepth source block and its row pitch.
//   dst, dst_stride          : CONV_BUF_TYPE output buffer and its row pitch.
//   w, h                     : output block size.
//   filter_params_x/y        : horizontal / vertical interpolation filters.
//   subpel_x_qn, x_step_qn   : start position and per-pixel step of the
//                              horizontal sampling position, in units of
//                              1 / (1 << SCALE_SUBPEL_BITS) sample.
//   subpel_y_qn, y_step_qn   : same for the vertical direction.
//   conv_params              : supplies round_0, round_1 and do_average.
//   bd                       : bit depth used to clamp intermediate samples.
//
// Pass 1 filters horizontally into a 16-bit intermediate block and pass 2
// filters that block vertically. The result is stored in `dst`, or added to
// the existing value when conv_params->do_average is set.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
                                    int h, InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int x, y, k;
  uint16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  // Number of intermediate rows: the integer source row reached by the last
  // output row, plus the vertical filter length.
  int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
             filter_params_y->taps;
  int im_stride = w;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;

  // horizontal filter
  const uint16_t *src_horiz = src - fo_vert * src_stride;
  for (y = 0; y < im_h; ++y) {
    int x_qn = subpel_x_qn;
    for (x = 0; x < w; ++x, x_qn += x_step_qn) {
      const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *x_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
      int sum = 0;
      for (k = 0; k < filter_params_x->taps; ++k)
        sum += x_filter[k] * src_x[k - fo_horiz];
      // Clamp to the high-bitdepth pixel range. The 8-bit clip_pixel() used
      // here previously saturated every intermediate sample at 255.
      im_block[y * im_stride + x] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, conv_params->round_0), bd);
    }
    src_horiz += src_stride;
  }

  // vertical filter
  uint16_t *src_vert = im_block + fo_vert * im_stride;
  for (x = 0; x < w; ++x) {
    int y_qn = subpel_y_qn;
    for (y = 0; y < h; ++y, y_qn += y_step_qn) {
      const uint16_t *const src_y =
          &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
      const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *y_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
      CONV_BUF_TYPE sum = 0;
      for (k = 0; k < filter_params_y->taps; ++k) {
        sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
      }
      CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
      if (conv_params->do_average)
        dst[y * dst_stride + x] += res;
      else
        dst[y * dst_stride + x] = res;
    }
    // Advance to the next column of the intermediate block.
    src_vert++;
  }
}
734 
735 #else
736 
// Separable 2-D high-bitdepth convolution (build without
// CONFIG_COMPOUND_ROUND).
//
// Pass 1 filters h + taps_y - 1 source rows horizontally into a 32-bit
// intermediate block. Each sum starts from 1 << (bd + FILTER_BITS - 1), which
// the assert expects to keep it non-negative, and is rounded by
// conv_params->round_0 without clamping. Pass 2 filters vertically starting
// from 1 << offset_bits, rounds by conv_params->round_1, and then subtracts
// (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1)).
// The result is stored in `dst`, or added to it when
// conv_params->do_average is set.
void av1_highbd_convolve_2d_c(const uint16_t *src, int src_stride,
                              CONV_BUF_TYPE *dst, int dst_stride, int w, int h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y,
                              const int subpel_x_q4, const int subpel_y_q4,
                              ConvolveParams *conv_params, int bd) {
  int32_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  const int im_h = h + taps_y - 1;
  const int im_stride = w;
  const int round_0 = conv_params->round_0;

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  const int16_t *const x_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const in_row = src_top + row * src_stride - fo_horiz;
    int32_t *const im_row = im_block + row * im_stride;
    for (int col = 0; col < w; ++col) {
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) sum += x_filter[k] * in_row[col + k];
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_row[col] = ROUND_POWER_OF_TWO(sum, round_0);
    }
  }

  // Pass 2: vertical filter. Output row `row` consumes intermediate rows
  // row .. row + taps_y - 1.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
      *filter_params_y, subpel_y_q4 & SUBPEL_MASK);
  const int round_1 = conv_params->round_1;
  const int accumulate = conv_params->do_average;
  // Constant subtracted from every rounded sum.
  const int rounded_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int row = 0; row < h; ++row) {
    CONV_BUF_TYPE *const out_row = dst + row * dst_stride;
    for (int col = 0; col < w; ++col) {
      CONV_BUF_TYPE sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += y_filter[k] * im_block[(row + k) * im_stride + col];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(sum, round_1) - rounded_offset;
      if (accumulate) {
        out_row[col] += res;
      } else {
        out_row[col] = res;
      }
    }
  }
}
789 
// Scaled, separable 2-D high-bitdepth convolution (build without
// CONFIG_COMPOUND_ROUND).
//
// Sampling positions advance by x_step_qn / y_step_qn per output pixel, in
// units of 1 / (1 << SCALE_SUBPEL_BITS) sample, starting at subpel_x_qn /
// subpel_y_qn. The integer part selects the source sample and the fractional
// part (shifted down by SCALE_EXTRA_BITS) selects the filter kernel.
//
// Pass 1 writes offset, rounded horizontal sums to a 32-bit intermediate
// block. Pass 2 filters it vertically, removes the offset, and stores the
// result in `dst`, or adds it when conv_params->do_average is set.
void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
                                    CONV_BUF_TYPE *dst, int dst_stride, int w,
                                    int h, InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y,
                                    const int subpel_x_qn, const int x_step_qn,
                                    const int subpel_y_qn, const int y_step_qn,
                                    ConvolveParams *conv_params, int bd) {
  int32_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
  const int taps_x = filter_params_x->taps;
  const int taps_y = filter_params_y->taps;
  const int fo_vert = taps_y / 2 - 1;
  const int fo_horiz = taps_x / 2 - 1;
  // Integer source row reached by the last output row, plus the filter length.
  const int im_h =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
  const int im_stride = w;
  const int round_0 = conv_params->round_0;

  // Pass 1: horizontal filter, starting fo_vert rows above the block.
  const uint16_t *const src_top = src - fo_vert * src_stride;
  for (int row = 0; row < im_h; ++row) {
    const uint16_t *const in_row = src_top + row * src_stride;
    int32_t *const im_row = im_block + row * im_stride;
    int pos_qn = subpel_x_qn;
    for (int col = 0; col < w; ++col) {
      const uint16_t *const src_x = &in_row[(pos_qn >> SCALE_SUBPEL_BITS)];
      const int x_filter_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(x_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const x_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
      int32_t sum = (1 << (bd + FILTER_BITS - 1));
      for (int k = 0; k < taps_x; ++k) {
        sum += x_filter[k] * src_x[k - fo_horiz];
      }
      assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
      im_row[col] = ROUND_POWER_OF_TWO(sum, round_0);
      pos_qn += x_step_qn;
    }
  }

  // Pass 2: vertical filter, one output column at a time.
  const int offset_bits = bd + 2 * FILTER_BITS - round_0;
  const int round_1 = conv_params->round_1;
  const int accumulate = conv_params->do_average;
  // Constant subtracted from every rounded sum.
  const int rounded_offset =
      (1 << (offset_bits - round_1)) + (1 << (offset_bits - round_1 - 1));
  for (int col = 0; col < w; ++col) {
    int pos_qn = subpel_y_qn;
    for (int row = 0; row < h; ++row) {
      // First intermediate sample covered by the kernel for this output.
      const int32_t *const tap0 =
          im_block + (pos_qn >> SCALE_SUBPEL_BITS) * im_stride + col;
      const int y_filter_idx = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
      assert(y_filter_idx < SUBPEL_SHIFTS);
      const int16_t *const y_filter =
          av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
      CONV_BUF_TYPE sum = 1 << offset_bits;
      for (int k = 0; k < taps_y; ++k) {
        sum += y_filter[k] * tap0[k * im_stride];
      }
      assert(0 <= sum && sum < (1 << (offset_bits + 2)));
      const CONV_BUF_TYPE res =
          ROUND_POWER_OF_TWO(sum, round_1) - rounded_offset;
      CONV_BUF_TYPE *const out = dst + row * dst_stride + col;
      if (accumulate) {
        *out += res;
      } else {
        *out = res;
      }
      pos_qn += y_step_qn;
    }
  }
}
853 #endif  // CONFIG_COMPOUND_ROUND
854 
// Entry point for the high-bitdepth 2-D convolution.
//
// Looks up the horizontal / vertical filters for `interp_filters`, then calls
// av1_highbd_convolve_2d_scale when `scaled` is non-zero and
// av1_highbd_convolve_2d otherwise. Output goes to conv_params->dst; the
// `dst` / `dst_stride` arguments are not used.
//
// When the vertical filter has fewer taps than the horizontal one, the source
// window and the current contents of conv_params->dst are transposed into
// local buffers, the convolution runs with the x and y roles swapped, and the
// result is transposed back.
// NOTE(review): the transposed source buffer is sized for an unscaled window
// (w + taps - 1 by h + taps - 1); confirm this is sufficient when `scaled`.
void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   InterpFilters interp_filters,
                                   const int subpel_x_q4, int x_step_q4,
                                   const int subpel_y_q4, int y_step_q4,
                                   int scaled, ConvolveParams *conv_params,
                                   int bd) {
  (void)dst;
  (void)dst_stride;

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 1, &filter_params_x,
                                 &filter_params_y);

  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);

  if (filter_params_y.taps >= filter_params_x.taps) {
    // Direct path: filter in the natural orientation.
    if (scaled) {
      av1_highbd_convolve_2d_scale(
          src, src_stride, conv_params->dst, conv_params->dst_stride, w, h,
          &filter_params_x, &filter_params_y, subpel_x_q4, x_step_q4,
          subpel_y_q4, y_step_q4, conv_params, bd);
    } else {
      av1_highbd_convolve_2d(src, src_stride, conv_params->dst,
                             conv_params->dst_stride, w, h, &filter_params_x,
                             &filter_params_y, subpel_x_q4, subpel_y_q4,
                             conv_params, bd);
    }
    return;
  }

  // Transposed path.
  uint16_t tr_src[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) *
                  (MAX_SB_SIZE + MAX_FILTER_TAP - 1)];
  const int tr_src_stride = MAX_SB_SIZE + MAX_FILTER_TAP - 1;
  CONV_BUF_TYPE tr_dst[MAX_SB_SIZE * MAX_SB_SIZE];
  const int tr_dst_stride = MAX_SB_SIZE;
  const int fo_vert = filter_params_y.taps / 2 - 1;
  const int fo_horiz = filter_params_x.taps / 2 - 1;

  // Transpose the source window, including the filter borders, and the
  // existing destination contents.
  transpose_uint16(tr_src, tr_src_stride,
                   src - fo_vert * src_stride - fo_horiz, src_stride,
                   w + filter_params_x.taps - 1, h + filter_params_y.taps - 1);
  transpose_int32(tr_dst, tr_dst_stride, conv_params->dst,
                  conv_params->dst_stride, w, h);

  // Position inside the transposed window that corresponds to `src`.
  const uint16_t *const tr_origin =
      tr_src + fo_horiz * tr_src_stride + fo_vert;

  // Horizontal and vertical parameters are swapped because of the transpose.
  if (scaled) {
    av1_highbd_convolve_2d_scale(tr_origin, tr_src_stride, tr_dst,
                                 tr_dst_stride, h, w, &filter_params_y,
                                 &filter_params_x, subpel_y_q4, y_step_q4,
                                 subpel_x_q4, x_step_q4, conv_params, bd);
  } else {
    av1_highbd_convolve_2d(tr_origin, tr_src_stride, tr_dst, tr_dst_stride, h,
                           w, &filter_params_y, &filter_params_x, subpel_y_q4,
                           subpel_x_q4, conv_params, bd);
  }

  // Transpose the result back into the caller's buffer.
  transpose_int32(conv_params->dst, conv_params->dst_stride, tr_dst,
                  tr_dst_stride, h, w);
}
913 #endif  // CONFIG_HIGHBITDEPTH
914 
915 #endif  // CONFIG_CONVOLVE_ROUND
916 
// Common signature of the one-dimensional (horizontal or vertical) convolution
// routines that convolve_helper() and convolve_scale_helper() receive as their
// convolve_horiz / convolve_vert arguments.
typedef void (*ConvolveFunc)(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h,
                             const InterpFilterParams filter_params,
                             const int subpel_q4, int step_q4,
                             ConvolveParams *conv_params);
922 
// Runs a separable convolution using the supplied 1-D routines.
//
// A direction is skipped when its sub-pixel offset is zero and its step equals
// SUBPEL_SHIFTS. Depending on which directions are skipped this either copies
// the block, runs a single 1-D pass, or runs two passes through a temporary
// buffer. The first of the two passes always uses non-averaging parameters;
// the caller's conv_params are applied on the final pass only.
static void convolve_helper(const uint8_t *src, int src_stride, uint8_t *dst,
                            int dst_stride, int w, int h,
                            const InterpFilters interp_filters,
                            const int subpel_x_q4, int x_step_q4,
                            const int subpel_y_q4, int y_step_q4,
                            ConvolveParams *conv_params,
                            ConvolveFunc convolve_horiz,
                            ConvolveFunc convolve_vert) {
  const int skip_horiz = (subpel_x_q4 == 0) && (x_step_q4 == SUBPEL_SHIFTS);
  const int skip_vert = (subpel_y_q4 == 0) && (y_step_q4 == SUBPEL_SHIFTS);

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_q4 <= MAX_STEP);
  assert(x_step_q4 <= MAX_STEP);

  if (skip_horiz && skip_vert) {
    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
    return;
  }

  if (skip_vert) {
    // Horizontal filtering only.
    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                   subpel_x_q4, x_step_q4, conv_params);
    return;
  }

  if (skip_horiz) {
    // Vertical filtering only.
    assert(filter_params_y.taps <= MAX_FILTER_TAP);
    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                  subpel_y_q4, y_step_q4, conv_params);
    return;
  }

  // Two-pass case. temp's size is set to a 256 aligned value to facilitate
  // SIMD implementation. The value is greater than (maximum possible
  // intermediate height or width) * MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint8_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
  (void)max_intermediate_size;

  // The first pass writes plain filtered samples into temp.
  ConvolveParams first_pass_params;
  first_pass_params.ref = 0;
  first_pass_params.do_average = 0;
  first_pass_params.round = CONVOLVE_OPT_ROUND;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  // We do the filter with fewer taps first to reduce hardware implementation
  // complexity.
  if (filter_params_y.taps < filter_params_x.taps) {
    const int x_taps = filter_params_x.taps;
    const int wide_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + x_taps;
    assert(intermediate_width <= max_intermediate_size);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    // Vertical pass over a window widened by the horizontal filter's reach.
    convolve_vert(src - (x_taps / 2 - 1), src_stride, temp, wide_stride,
                  intermediate_width, h, filter_params_y, subpel_y_q4,
                  y_step_q4, &first_pass_params);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(temp + (x_taps / 2 - 1), wide_stride, dst, dst_stride, w, h,
                   filter_params_x, subpel_x_q4, x_step_q4, conv_params);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  const int y_taps = filter_params_y.taps;
  const int temp_stride = MAX_SB_SIZE;
  const int intermediate_height =
      (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + y_taps;
  assert(intermediate_height <= max_intermediate_size);

  assert(filter_params_x.taps <= MAX_FILTER_TAP);

  // Horizontal pass over a window extended by the vertical filter's reach.
  convolve_horiz(src - src_stride * (y_taps / 2 - 1), src_stride, temp,
                 temp_stride, w, intermediate_height, filter_params_x,
                 subpel_x_q4, x_step_q4, &first_pass_params);

  assert(filter_params_y.taps <= MAX_FILTER_TAP);

  convolve_vert(temp + temp_stride * (y_taps / 2 - 1), temp_stride, dst,
                dst_stride, w, h, filter_params_y, subpel_y_q4, y_step_q4,
                conv_params);
}
1019 
// Scaled counterpart of the separable convolution dispatcher.
//
// Offsets and steps are expressed with SCALE_SUBPEL_BITS of fraction. A
// direction is skipped when its offset is zero and its step equals
// SCALE_SUBPEL_SHIFTS. Depending on which directions are skipped this either
// copies the block, runs a single 1-D pass, or runs two passes through a
// temporary buffer. The first of the two passes always uses non-averaging
// parameters; the caller's conv_params are applied on the final pass only.
static void convolve_scale_helper(const uint8_t *src, int src_stride,
                                  uint8_t *dst, int dst_stride, int w, int h,
                                  const InterpFilters interp_filters,
                                  const int subpel_x_qn, int x_step_qn,
                                  const int subpel_y_qn, int y_step_qn,
                                  ConvolveParams *conv_params,
                                  ConvolveFunc convolve_horiz,
                                  ConvolveFunc convolve_vert) {
  const int skip_horiz =
      (subpel_x_qn == 0) && (x_step_qn == SCALE_SUBPEL_SHIFTS);
  const int skip_vert =
      (subpel_y_qn == 0) && (y_step_qn == SCALE_SUBPEL_SHIFTS);

  InterpFilterParams filter_params_x, filter_params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &filter_params_x,
                                 &filter_params_y);

  assert(conv_params->round == CONVOLVE_OPT_ROUND);

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));

  if (skip_horiz && skip_vert) {
    convolve_copy(src, src_stride, dst, dst_stride, w, h, conv_params);
    return;
  }

  if (skip_vert) {
    // Horizontal filtering only.
    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                   subpel_x_qn, x_step_qn, conv_params);
    return;
  }

  if (skip_horiz) {
    // Vertical filtering only.
    assert(filter_params_y.taps <= MAX_FILTER_TAP);
    convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params_y,
                  subpel_y_qn, y_step_qn, conv_params);
    return;
  }

  // Two-pass case. temp's size is set to a 256 aligned value to facilitate
  // SIMD implementation. The value is greater than (maximum possible
  // intermediate height or width) * MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint8_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  const int max_intermediate_size = ((MAX_SB_SIZE * 2 + 16) + 16);
  (void)max_intermediate_size;

  // The first pass writes plain filtered samples into temp.
  ConvolveParams first_pass_params;
  first_pass_params.ref = 0;
  first_pass_params.do_average = 0;
  first_pass_params.round = CONVOLVE_OPT_ROUND;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&filter_params_x, &filter_params_y);

  // We do the filter with fewer taps first to reduce hardware implementation
  // complexity.
  if (filter_params_y.taps < filter_params_x.taps) {
    const int x_taps = filter_params_x.taps;
    const int wide_stride = max_intermediate_size;
    const int intermediate_width =
        (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + x_taps;
    assert(intermediate_width <= max_intermediate_size);

    assert(filter_params_y.taps <= MAX_FILTER_TAP);

    // Vertical pass over a window widened by the horizontal filter's reach.
    convolve_vert(src - (x_taps / 2 - 1), src_stride, temp, wide_stride,
                  intermediate_width, h, filter_params_y, subpel_y_qn,
                  y_step_qn, &first_pass_params);

    assert(filter_params_x.taps <= MAX_FILTER_TAP);
    convolve_horiz(temp + (x_taps / 2 - 1), wide_stride, dst, dst_stride, w, h,
                   filter_params_x, subpel_x_qn, x_step_qn, conv_params);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  const int y_taps = filter_params_y.taps;
  const int temp_stride = MAX_SB_SIZE;
  const int intermediate_height =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + y_taps;
  assert(intermediate_height <= max_intermediate_size);

  assert(filter_params_x.taps <= MAX_FILTER_TAP);

  // Horizontal pass over a window extended by the vertical filter's reach.
  convolve_horiz(src - src_stride * (y_taps / 2 - 1), src_stride, temp,
                 temp_stride, w, intermediate_height, filter_params_x,
                 subpel_x_qn, x_step_qn, &first_pass_params);

  assert(filter_params_y.taps <= MAX_FILTER_TAP);

  convolve_vert(temp + temp_stride * (y_taps / 2 - 1), temp_stride, dst,
                dst_stride, w, h, filter_params_y, subpel_y_qn, y_step_qn,
                conv_params);
}
1119 
// Separable convolution that forwards to convolve_helper() with the
// av1_convolve_horiz_facade / av1_convolve_vert_facade 1-D routines.
void av1_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
                  int dst_stride, int w, int h, InterpFilters interp_filters,
                  const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
                  int y_step_q4, ConvolveParams *conv_params) {
  const ConvolveFunc horiz_fn = av1_convolve_horiz_facade;
  const ConvolveFunc vert_fn = av1_convolve_vert_facade;
  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                  horiz_fn, vert_fn);
}
1128 
// Same as av1_convolve() but forwards to convolve_helper() with the "_c"
// 1-D routines (av1_convolve_horiz_facade_c / av1_convolve_vert_facade_c).
void av1_convolve_c(const uint8_t *src, int src_stride, uint8_t *dst,
                    int dst_stride, int w, int h, InterpFilters interp_filters,
                    const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
                    int y_step_q4, ConvolveParams *conv_params) {
  const ConvolveFunc horiz_fn = av1_convolve_horiz_facade_c;
  const ConvolveFunc vert_fn = av1_convolve_vert_facade_c;
  convolve_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                  subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, conv_params,
                  horiz_fn, vert_fn);
}
1137 
// Scaled separable convolution that forwards to convolve_scale_helper() with
// the av1_convolve_horiz_facade_scale / av1_convolve_vert_facade_scale 1-D
// routines.
void av1_convolve_scale(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int w, int h,
                        InterpFilters interp_filters, const int subpel_x_qn,
                        int x_step_qn, const int subpel_y_qn, int y_step_qn,
                        ConvolveParams *conv_params) {
  const ConvolveFunc horiz_fn = av1_convolve_horiz_facade_scale;
  const ConvolveFunc vert_fn = av1_convolve_vert_facade_scale;
  convolve_scale_helper(src, src_stride, dst, dst_stride, w, h, interp_filters,
                        subpel_x_qn, x_step_qn, subpel_y_qn, y_step_qn,
                        conv_params, horiz_fn, vert_fn);
}
1148 
// Low-bitdepth convolve initialization hook for the C build. Intentionally
// does nothing; it exists as a placeholder for SIMD initialization.
void av1_lowbd_convolve_init_c(void) {}
1153 
// High-bitdepth convolve initialization hook for the C build. Intentionally
// does nothing; it exists as a placeholder for SIMD initialization.
void av1_highbd_convolve_init_c(void) {}
1158 
// Runs the convolve initialization hook that matches the stream's bit depth:
// the high-bitdepth hook when CONFIG_HIGHBITDEPTH is enabled and
// cm->use_highbitdepth is set, the low-bitdepth hook otherwise.
void av1_convolve_init(AV1_COMMON *cm) {
#if CONFIG_HIGHBITDEPTH
  if (!cm->use_highbitdepth) {
    av1_lowbd_convolve_init();
    return;
  }
  av1_highbd_convolve_init();
#else
  (void)cm;
  av1_lowbd_convolve_init();
#endif
}
1171 
1172 #if CONFIG_HIGHBITDEPTH
av1_highbd_convolve_horiz_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_q4,int x_step_q4,int avg,int bd)1173 void av1_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
1174                                  uint16_t *dst, int dst_stride, int w, int h,
1175                                  const InterpFilterParams filter_params,
1176                                  const int subpel_x_q4, int x_step_q4, int avg,
1177                                  int bd) {
1178   int x, y;
1179   int filter_size = filter_params.taps;
1180   src -= filter_size / 2 - 1;
1181   for (y = 0; y < h; ++y) {
1182     int x_q4 = subpel_x_q4;
1183     for (x = 0; x < w; ++x) {
1184       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1185       const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
1186           filter_params, x_q4 & SUBPEL_MASK);
1187       int k, sum = 0;
1188       for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
1189       if (avg)
1190         dst[x] = ROUND_POWER_OF_TWO(
1191             dst[x] +
1192                 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1193             1);
1194       else
1195         dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1196       x_q4 += x_step_q4;
1197     }
1198     src += src_stride;
1199     dst += dst_stride;
1200   }
1201 }
1202 
av1_highbd_convolve_horiz_scale(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_qn,int x_step_qn,int avg,int bd)1203 void av1_highbd_convolve_horiz_scale(const uint16_t *src, int src_stride,
1204                                      uint16_t *dst, int dst_stride, int w,
1205                                      int h,
1206                                      const InterpFilterParams filter_params,
1207                                      const int subpel_x_qn, int x_step_qn,
1208                                      int avg, int bd) {
1209   int x, y;
1210   int filter_size = filter_params.taps;
1211   src -= filter_size / 2 - 1;
1212   for (y = 0; y < h; ++y) {
1213     int x_qn = subpel_x_qn;
1214     for (x = 0; x < w; ++x) {
1215       const uint16_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
1216       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1217       assert(x_filter_idx < SUBPEL_SHIFTS);
1218       const int16_t *x_filter =
1219           av1_get_interp_filter_subpel_kernel(filter_params, x_filter_idx);
1220       int k, sum = 0;
1221       for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
1222       if (avg)
1223         dst[x] = ROUND_POWER_OF_TWO(
1224             dst[x] +
1225                 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1226             1);
1227       else
1228         dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1229       x_qn += x_step_qn;
1230     }
1231     src += src_stride;
1232     dst += dst_stride;
1233   }
1234 }
1235 
av1_highbd_convolve_vert_c(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_q4,int y_step_q4,int avg,int bd)1236 void av1_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
1237                                 uint16_t *dst, int dst_stride, int w, int h,
1238                                 const InterpFilterParams filter_params,
1239                                 const int subpel_y_q4, int y_step_q4, int avg,
1240                                 int bd) {
1241   int x, y;
1242   int filter_size = filter_params.taps;
1243   src -= src_stride * (filter_size / 2 - 1);
1244 
1245   for (x = 0; x < w; ++x) {
1246     int y_q4 = subpel_y_q4;
1247     for (y = 0; y < h; ++y) {
1248       const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1249       const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
1250           filter_params, y_q4 & SUBPEL_MASK);
1251       int k, sum = 0;
1252       for (k = 0; k < filter_size; ++k)
1253         sum += src_y[k * src_stride] * y_filter[k];
1254       if (avg) {
1255         dst[y * dst_stride] = ROUND_POWER_OF_TWO(
1256             dst[y * dst_stride] +
1257                 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1258             1);
1259       } else {
1260         dst[y * dst_stride] =
1261             clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1262       }
1263       y_q4 += y_step_q4;
1264     }
1265     ++src;
1266     ++dst;
1267   }
1268 }
1269 
av1_highbd_convolve_vert_scale(const uint16_t * src,int src_stride,uint16_t * dst,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_qn,int y_step_qn,int avg,int bd)1270 void av1_highbd_convolve_vert_scale(const uint16_t *src, int src_stride,
1271                                     uint16_t *dst, int dst_stride, int w, int h,
1272                                     const InterpFilterParams filter_params,
1273                                     const int subpel_y_qn, int y_step_qn,
1274                                     int avg, int bd) {
1275   int x, y;
1276   int filter_size = filter_params.taps;
1277   src -= src_stride * (filter_size / 2 - 1);
1278 
1279   for (x = 0; x < w; ++x) {
1280     int y_qn = subpel_y_qn;
1281     for (y = 0; y < h; ++y) {
1282       const uint16_t *const src_y =
1283           &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
1284       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
1285       assert(y_filter_idx < SUBPEL_SHIFTS);
1286       const int16_t *y_filter =
1287           av1_get_interp_filter_subpel_kernel(filter_params, y_filter_idx);
1288       int k, sum = 0;
1289       for (k = 0; k < filter_size; ++k)
1290         sum += src_y[k * src_stride] * y_filter[k];
1291       if (avg) {
1292         dst[y * dst_stride] = ROUND_POWER_OF_TWO(
1293             dst[y * dst_stride] +
1294                 clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
1295             1);
1296       } else {
1297         dst[y * dst_stride] =
1298             clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
1299       }
1300       y_qn += y_step_qn;
1301     }
1302     ++src;
1303     ++dst;
1304   }
1305 }
1306 
// Transfers a w x h block of uint16_t samples from src to dst without any
// filtering.
//
// With avg == 0 each row is copied verbatim. Otherwise each dst sample is
// replaced by the rounded mean of itself and the matching src sample, clipped
// with clip_pixel_highbd() for bit depth bd.
static void highbd_convolve_copy(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h,
                                 int avg, int bd) {
  int row, col;

  for (row = 0; row < h; ++row) {
    if (!avg) {
      memcpy(dst, src, w * sizeof(*src));
    } else {
      for (col = 0; col < w; ++col) {
        const int rounded_mean = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
        dst[col] = clip_pixel_highbd(rounded_mean, bd);
      }
    }
    src += src_stride;
    dst += dst_stride;
  }
}
1328 
av1_highbd_convolve_horiz_facade(const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_x_q4,int x_step_q4,int avg,int bd)1329 void av1_highbd_convolve_horiz_facade(const uint8_t *src8, int src_stride,
1330                                       uint8_t *dst8, int dst_stride, int w,
1331                                       int h,
1332                                       const InterpFilterParams filter_params,
1333                                       const int subpel_x_q4, int x_step_q4,
1334                                       int avg, int bd) {
1335   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1336   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1337   if (filter_params.taps == SUBPEL_TAPS) {
1338     const int16_t *filter_x =
1339         av1_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
1340     if (avg == 0)
1341       aom_highbd_convolve8_horiz(src8, src_stride, dst8, dst_stride, filter_x,
1342                                  x_step_q4, NULL, -1, w, h, bd);
1343     else
1344       aom_highbd_convolve8_avg_horiz(src8, src_stride, dst8, dst_stride,
1345                                      filter_x, x_step_q4, NULL, -1, w, h, bd);
1346   } else {
1347     av1_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
1348                               filter_params, subpel_x_q4, x_step_q4, avg, bd);
1349   }
1350 }
1351 
// Horizontal high bit-depth convolution entry point for the scaled
// (SCALE_SUBPEL_BITS precision) path. Converts src8/dst8 with
// CONVERT_TO_SHORTPTR() and forwards every argument unchanged to
// av1_highbd_convolve_horiz_scale().
void av1_highbd_convolve_horiz_facade_scale(
    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
    int h, const InterpFilterParams filter_params, const int subpel_x_qn,
    int x_step_qn, int avg, int bd) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
  // as in av1_highbd_convolve_horiz_facade().
  av1_highbd_convolve_horiz_scale(src16, src_stride, dst16, dst_stride, w, h,
                                  filter_params, subpel_x_qn, x_step_qn, avg,
                                  bd);
}
1364 
av1_highbd_convolve_vert_facade(const uint8_t * src8,int src_stride,uint8_t * dst8,int dst_stride,int w,int h,const InterpFilterParams filter_params,const int subpel_y_q4,int y_step_q4,int avg,int bd)1365 void av1_highbd_convolve_vert_facade(const uint8_t *src8, int src_stride,
1366                                      uint8_t *dst8, int dst_stride, int w,
1367                                      int h,
1368                                      const InterpFilterParams filter_params,
1369                                      const int subpel_y_q4, int y_step_q4,
1370                                      int avg, int bd) {
1371   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1372   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1373 
1374   if (filter_params.taps == SUBPEL_TAPS) {
1375     const int16_t *filter_y =
1376         av1_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
1377     if (avg == 0) {
1378       aom_highbd_convolve8_vert(src8, src_stride, dst8, dst_stride, NULL, -1,
1379                                 filter_y, y_step_q4, w, h, bd);
1380     } else {
1381       aom_highbd_convolve8_avg_vert(src8, src_stride, dst8, dst_stride, NULL,
1382                                     -1, filter_y, y_step_q4, w, h, bd);
1383     }
1384   } else {
1385     av1_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
1386                              filter_params, subpel_y_q4, y_step_q4, avg, bd);
1387   }
1388 }
1389 
// Vertical high bit-depth convolution entry point for the scaled
// (SCALE_SUBPEL_BITS precision) path. Converts src8/dst8 with
// CONVERT_TO_SHORTPTR() and forwards every argument unchanged to
// av1_highbd_convolve_vert_scale().
void av1_highbd_convolve_vert_facade_scale(
    const uint8_t *src8, int src_stride, uint8_t *dst8, int dst_stride, int w,
    int h, const InterpFilterParams filter_params, const int subpel_y_qn,
    int y_step_qn, int avg, int bd) {
  uint16_t *const src16 = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst8);

  // TODO(debargha): Add special functions for filter_params.taps == SUBPEL_TAPS
  // as in av1_highbd_convolve_vert_facade().
  av1_highbd_convolve_vert_scale(src16, src_stride, dst16, dst_stride, w, h,
                                 filter_params, subpel_y_qn, y_step_qn, avg,
                                 bd);
}
1402 
// Top-level high bit-depth convolution with q4 (SUBPEL_BITS) positions.
//
// Picks the cheapest path for the requested motion:
//   - neither direction needs filtering: plain copy / average,
//   - only one direction needs filtering: a single 1-D pass,
//   - otherwise: two 1-D passes through an intermediate buffer.
// A direction needs no filtering when its sub-pixel offset is 0 and its step
// equals SUBPEL_SHIFTS.
//
// ref_idx is forwarded as the "avg" flag of the final pass (and of the copy),
// so a non-zero ref_idx averages the prediction into dst. The first pass of
// a two-pass filter always writes (avg = 0) into the intermediate buffer.
void av1_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
                         int dst_stride, int w, int h,
                         InterpFilters interp_filters, const int subpel_x_q4,
                         int x_step_q4, const int subpel_y_q4, int y_step_q4,
                         int ref_idx, int bd) {
  const int skip_x = subpel_x_q4 == 0 && x_step_q4 == SUBPEL_SHIFTS;
  const int skip_y = subpel_y_q4 == 0 && y_step_q4 == SUBPEL_SHIFTS;

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_q4 <= MAX_STEP);
  assert(x_step_q4 <= MAX_STEP);

  if (skip_x && skip_y) {
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    highbd_convolve_copy(src16, src_stride, dst16, dst_stride, w, h, ref_idx,
                         bd);
    return;
  }

  InterpFilterParams params_x, params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &params_x, &params_y);

  if (skip_y) {
    av1_highbd_convolve_horiz_facade(src8, src_stride, dst8, dst_stride, w, h,
                                     params_x, subpel_x_q4, x_step_q4, ref_idx,
                                     bd);
    return;
  }

  if (skip_x) {
    av1_highbd_convolve_vert_facade(src8, src_stride, dst8, dst_stride, w, h,
                                    params_y, subpel_y_q4, y_step_q4, ref_idx,
                                    bd);
    return;
  }

  // Both directions are filtered. temp's size is set to a 256 aligned value to
  // facilitate SIMD implementation. The value is greater than (maximum
  // possible intermediate height or width) * MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint16_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  uint8_t *const temp8 = CONVERT_TO_BYTEPTR(temp);
  const int max_intermediate_size = (MAX_SB_SIZE * 2 + 16) + 16;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&params_x, &params_y);

  if (params_y.taps < params_x.taps) {
    // Vertical pass first (over a widened block), then horizontal.
    const int row_stride = max_intermediate_size;
    const int x_taps = params_x.taps;
    const int x_lead = x_taps / 2 - 1;
    const int intermediate_width =
        (((w - 1) * x_step_q4 + subpel_x_q4) >> SUBPEL_BITS) + x_taps;
    assert(intermediate_width <= max_intermediate_size);

    assert(params_y.taps <= MAX_FILTER_TAP);

    // Start x_lead columns to the left so the horizontal pass has the
    // samples its kernel window reaches back to.
    av1_highbd_convolve_vert_facade(src8 - x_lead, src_stride, temp8,
                                    row_stride, intermediate_width, h, params_y,
                                    subpel_y_q4, y_step_q4, 0, bd);

    assert(params_x.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_horiz_facade(temp8 + x_lead, row_stride, dst8,
                                     dst_stride, w, h, params_x, subpel_x_q4,
                                     x_step_q4, ref_idx, bd);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  // Horizontal pass first (over a heightened block), then vertical.
  const int col_stride = MAX_SB_SIZE;
  const int y_taps = params_y.taps;
  const int y_lead = y_taps / 2 - 1;
  const int intermediate_height =
      (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + y_taps;
  assert(intermediate_height <= max_intermediate_size);
  (void)max_intermediate_size;

  // Start y_lead rows above so the vertical pass has the samples its kernel
  // window reaches back to.
  av1_highbd_convolve_horiz_facade(src8 - src_stride * y_lead, src_stride,
                                   temp8, col_stride, w, intermediate_height,
                                   params_x, subpel_x_q4, x_step_q4, 0, bd);

  assert(params_y.taps <= MAX_FILTER_TAP);

  av1_highbd_convolve_vert_facade(temp8 + col_stride * y_lead, col_stride,
                                  dst8, dst_stride, w, h, params_y, subpel_y_q4,
                                  y_step_q4, ref_idx, bd);
}
1495 
// Top-level high bit-depth convolution with qn (SCALE_SUBPEL_BITS) positions.
//
// Picks the cheapest path for the requested motion:
//   - neither direction needs filtering: plain copy / average,
//   - only one direction needs filtering: a single 1-D pass,
//   - otherwise: two 1-D passes through an intermediate buffer.
// A direction needs no filtering when its sub-pixel offset is 0 and its step
// equals SCALE_SUBPEL_SHIFTS.
//
// ref_idx is forwarded as the "avg" flag of the final pass (and of the copy),
// so a non-zero ref_idx averages the prediction into dst. The first pass of
// a two-pass filter always writes (avg = 0) into the intermediate buffer.
void av1_highbd_convolve_scale(const uint8_t *src8, int src_stride,
                               uint8_t *dst8, int dst_stride, int w, int h,
                               InterpFilters interp_filters,
                               const int subpel_x_qn, int x_step_qn,
                               const int subpel_y_qn, int y_step_qn,
                               int ref_idx, int bd) {
  const int skip_x = subpel_x_qn == 0 && x_step_qn == SCALE_SUBPEL_SHIFTS;
  const int skip_y = subpel_y_qn == 0 && y_step_qn == SCALE_SUBPEL_SHIFTS;

  assert(w <= MAX_BLOCK_WIDTH);
  assert(h <= MAX_BLOCK_HEIGHT);
  assert(y_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));
  assert(x_step_qn <= (MAX_STEP << SCALE_EXTRA_BITS));

  if (skip_x && skip_y) {
    uint16_t *src16 = CONVERT_TO_SHORTPTR(src8);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
    highbd_convolve_copy(src16, src_stride, dst16, dst_stride, w, h, ref_idx,
                         bd);
    return;
  }

  InterpFilterParams params_x, params_y;
  av1_get_convolve_filter_params(interp_filters, 0, &params_x, &params_y);

  if (skip_y) {
    av1_highbd_convolve_horiz_facade_scale(src8, src_stride, dst8, dst_stride,
                                           w, h, params_x, subpel_x_qn,
                                           x_step_qn, ref_idx, bd);
    return;
  }

  if (skip_x) {
    av1_highbd_convolve_vert_facade_scale(src8, src_stride, dst8, dst_stride, w,
                                          h, params_y, subpel_y_qn, y_step_qn,
                                          ref_idx, bd);
    return;
  }

  // Both directions are filtered. temp's size is set to a 256 aligned value to
  // facilitate SIMD implementation. The value is greater than (maximum
  // possible intermediate height or width) * MAX_SB_SIZE.
  DECLARE_ALIGNED(16, uint16_t,
                  temp[((MAX_SB_SIZE * 2 + 16) + 16) * MAX_SB_SIZE]);
  uint8_t *const temp8 = CONVERT_TO_BYTEPTR(temp);
  const int max_intermediate_size = (MAX_SB_SIZE * 2 + 16) + 16;

#if CONFIG_DUAL_FILTER && USE_EXTRA_FILTER
  av1_convolve_filter_params_fixup_1212(&params_x, &params_y);

  if (params_y.taps < params_x.taps) {
    // Vertical pass first (over a widened block), then horizontal.
    const int row_stride = max_intermediate_size;
    const int x_taps = params_x.taps;
    const int x_lead = x_taps / 2 - 1;
    const int intermediate_width =
        (((w - 1) * x_step_qn + subpel_x_qn) >> SCALE_SUBPEL_BITS) + x_taps;
    assert(intermediate_width <= max_intermediate_size);

    assert(params_y.taps <= MAX_FILTER_TAP);

    // Start x_lead columns to the left so the horizontal pass has the
    // samples its kernel window reaches back to.
    av1_highbd_convolve_vert_facade_scale(src8 - x_lead, src_stride, temp8,
                                          row_stride, intermediate_width, h,
                                          params_y, subpel_y_qn, y_step_qn, 0,
                                          bd);

    assert(params_x.taps <= MAX_FILTER_TAP);

    av1_highbd_convolve_horiz_facade_scale(temp8 + x_lead, row_stride, dst8,
                                           dst_stride, w, h, params_x,
                                           subpel_x_qn, x_step_qn, ref_idx, bd);
    return;
  }
#endif  // CONFIG_DUAL_FILTER && USE_EXTRA_FILTER

  // Horizontal pass first (over a heightened block), then vertical.
  const int col_stride = MAX_SB_SIZE;
  const int y_taps = params_y.taps;
  const int y_lead = y_taps / 2 - 1;
  const int intermediate_height =
      (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + y_taps;
  assert(intermediate_height <= max_intermediate_size);
  (void)max_intermediate_size;

  // Start y_lead rows above so the vertical pass has the samples its kernel
  // window reaches back to.
  av1_highbd_convolve_horiz_facade_scale(
      src8 - src_stride * y_lead, src_stride, temp8, col_stride, w,
      intermediate_height, params_x, subpel_x_qn, x_step_qn, 0, bd);

  assert(params_y.taps <= MAX_FILTER_TAP);

  av1_highbd_convolve_vert_facade_scale(temp8 + col_stride * y_lead, col_stride,
                                        dst8, dst_stride, w, h, params_y,
                                        subpel_y_qn, y_step_qn, ref_idx, bd);
}
1591 #endif  // CONFIG_HIGHBITDEPTH
1592