1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string.h>
14 
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/aom_convolve.h"
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_ports/mem.h"
22 
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)23 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
24                            uint8_t *dst, ptrdiff_t dst_stride,
25                            const InterpKernel *x_filters, int x0_q4,
26                            int x_step_q4, int w, int h) {
27   int x, y;
28   src -= SUBPEL_TAPS / 2 - 1;
29   for (y = 0; y < h; ++y) {
30     int x_q4 = x0_q4;
31     for (x = 0; x < w; ++x) {
32       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34       int k, sum = 0;
35       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37       x_q4 += x_step_q4;
38     }
39     src += src_stride;
40     dst += dst_stride;
41   }
42 }
43 
convolve_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_qn,int x_step_qn,int w,int h)44 static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
45                                    uint8_t *dst, ptrdiff_t dst_stride,
46                                    const InterpKernel *x_filters, int x0_qn,
47                                    int x_step_qn, int w, int h) {
48   int x, y;
49   src -= SUBPEL_TAPS / 2 - 1;
50   for (y = 0; y < h; ++y) {
51     int x_qn = x0_qn;
52     for (x = 0; x < w; ++x) {
53       const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
54       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
55       assert(x_filter_idx < SUBPEL_SHIFTS);
56       const int16_t *const x_filter = x_filters[x_filter_idx];
57       int k, sum = 0;
58       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
59       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
60       x_qn += x_step_qn;
61     }
62     src += src_stride;
63     dst += dst_stride;
64   }
65 }
66 
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)67 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
68                                uint8_t *dst, ptrdiff_t dst_stride,
69                                const InterpKernel *x_filters, int x0_q4,
70                                int x_step_q4, int w, int h) {
71   int x, y;
72   src -= SUBPEL_TAPS / 2 - 1;
73   for (y = 0; y < h; ++y) {
74     int x_q4 = x0_q4;
75     for (x = 0; x < w; ++x) {
76       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
77       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
78       int k, sum = 0;
79       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
80       dst[x] = ROUND_POWER_OF_TWO(
81           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
82       x_q4 += x_step_q4;
83     }
84     src += src_stride;
85     dst += dst_stride;
86   }
87 }
88 
convolve_avg_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_qn,int x_step_qn,int w,int h)89 static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
90                                        uint8_t *dst, ptrdiff_t dst_stride,
91                                        const InterpKernel *x_filters, int x0_qn,
92                                        int x_step_qn, int w, int h) {
93   int x, y;
94   src -= SUBPEL_TAPS / 2 - 1;
95   for (y = 0; y < h; ++y) {
96     int x_qn = x0_qn;
97     for (x = 0; x < w; ++x) {
98       const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
99       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
100       assert(x_filter_idx < SUBPEL_SHIFTS);
101       const int16_t *const x_filter = x_filters[x_filter_idx];
102       int k, sum = 0;
103       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
104       dst[x] = ROUND_POWER_OF_TWO(
105           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
106       x_qn += x_step_qn;
107     }
108     src += src_stride;
109     dst += dst_stride;
110   }
111 }
112 
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)113 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
114                           uint8_t *dst, ptrdiff_t dst_stride,
115                           const InterpKernel *y_filters, int y0_q4,
116                           int y_step_q4, int w, int h) {
117   int x, y;
118   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
119 
120   for (x = 0; x < w; ++x) {
121     int y_q4 = y0_q4;
122     for (y = 0; y < h; ++y) {
123       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
124       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
125       int k, sum = 0;
126       for (k = 0; k < SUBPEL_TAPS; ++k)
127         sum += src_y[k * src_stride] * y_filter[k];
128       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
129       y_q4 += y_step_q4;
130     }
131     ++src;
132     ++dst;
133   }
134 }
135 
convolve_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_qn,int y_step_qn,int w,int h)136 static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
137                                   uint8_t *dst, ptrdiff_t dst_stride,
138                                   const InterpKernel *y_filters, int y0_qn,
139                                   int y_step_qn, int w, int h) {
140   int x, y;
141   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
142 
143   for (x = 0; x < w; ++x) {
144     int y_qn = y0_qn;
145     for (y = 0; y < h; ++y) {
146       const unsigned char *src_y =
147           &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
148       const int16_t *const y_filter =
149           y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
150       int k, sum = 0;
151       for (k = 0; k < SUBPEL_TAPS; ++k)
152         sum += src_y[k * src_stride] * y_filter[k];
153       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
154       y_qn += y_step_qn;
155     }
156     ++src;
157     ++dst;
158   }
159 }
160 
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)161 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
162                               uint8_t *dst, ptrdiff_t dst_stride,
163                               const InterpKernel *y_filters, int y0_q4,
164                               int y_step_q4, int w, int h) {
165   int x, y;
166   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
167 
168   for (x = 0; x < w; ++x) {
169     int y_q4 = y0_q4;
170     for (y = 0; y < h; ++y) {
171       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
172       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
173       int k, sum = 0;
174       for (k = 0; k < SUBPEL_TAPS; ++k)
175         sum += src_y[k * src_stride] * y_filter[k];
176       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
177           dst[y * dst_stride] +
178               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
179           1);
180       y_q4 += y_step_q4;
181     }
182     ++src;
183     ++dst;
184   }
185 }
186 
convolve_avg_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_qn,int y_step_qn,int w,int h)187 static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
188                                       uint8_t *dst, ptrdiff_t dst_stride,
189                                       const InterpKernel *y_filters, int y0_qn,
190                                       int y_step_qn, int w, int h) {
191   int x, y;
192   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
193 
194   for (x = 0; x < w; ++x) {
195     int y_qn = y0_qn;
196     for (y = 0; y < h; ++y) {
197       const unsigned char *src_y =
198           &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
199       const int16_t *const y_filter =
200           y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
201       int k, sum = 0;
202       for (k = 0; k < SUBPEL_TAPS; ++k)
203         sum += src_y[k * src_stride] * y_filter[k];
204       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
205           dst[y * dst_stride] +
206               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
207           1);
208       y_qn += y_step_qn;
209     }
210     ++src;
211     ++dst;
212   }
213 }
214 
convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)215 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
216                      ptrdiff_t dst_stride, const InterpKernel *const x_filters,
217                      int x0_q4, int x_step_q4,
218                      const InterpKernel *const y_filters, int y0_q4,
219                      int y_step_q4, int w, int h) {
220   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
221   // 2d filtering proceeds in 2 steps:
222   //   (1) Interpolate horizontally into an intermediate buffer, temp.
223   //   (2) Interpolate temp vertically to derive the sub-pixel result.
224   // Deriving the maximum number of rows in the temp buffer (135):
225   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
226   // --Largest block size is 64x64 pixels.
227   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
228   //   original frame (in 1/16th pixel units).
229   // --Must round-up because block may be located at sub-pixel position.
230   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
231   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
232   uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
233   int intermediate_height =
234       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
235 
236   assert(w <= MAX_SB_SIZE);
237   assert(h <= MAX_SB_SIZE);
238 
239   assert(y_step_q4 <= 32);
240   assert(x_step_q4 <= 32);
241 
242   convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
243                  MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
244                  intermediate_height);
245   convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
246                 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
247 }
248 
convolve_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_qn,int x_step_qn,const InterpKernel * const y_filters,int y0_qn,int y_step_qn,int w,int h)249 static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
250                              uint8_t *dst, ptrdiff_t dst_stride,
251                              const InterpKernel *const x_filters, int x0_qn,
252                              int x_step_qn, const InterpKernel *const y_filters,
253                              int y0_qn, int y_step_qn, int w, int h) {
254   // TODO(afergs): Update comment here
255   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
256   // 2d filtering proceeds in 2 steps:
257   //   (1) Interpolate horizontally into an intermediate buffer, temp.
258   //   (2) Interpolate temp vertically to derive the sub-pixel result.
259   // Deriving the maximum number of rows in the temp buffer (135):
260   // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
261   // --Largest block size is 64x64 pixels.
262   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
263   //   original frame (in 1/16th pixel units).
264   // --Must round-up because block may be located at sub-pixel position.
265   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
266   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
267   uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
268   int intermediate_height =
269       (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
270 
271   assert(w <= MAX_SB_SIZE);
272   assert(h <= MAX_SB_SIZE);
273 
274   assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
275   assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
276 
277   convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
278                          temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
279                          intermediate_height);
280   convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
281                         dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
282 }
283 
get_filter_base(const int16_t * filter)284 static const InterpKernel *get_filter_base(const int16_t *filter) {
285   // NOTE: This assumes that the filter table is 256-byte aligned.
286   // TODO(agrange) Modify to make independent of table alignment.
287   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
288 }
289 
// Returns how many whole kernels `f` lies past `base`, i.e. the index of the
// kernel `f` points at within the table starting at `base`.
static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
  const InterpKernel *const kernel = (const InterpKernel *)(intptr_t)f;
  return (int)(kernel - base);
}
293 
aom_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)294 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
295                            uint8_t *dst, ptrdiff_t dst_stride,
296                            const int16_t *filter_x, int x_step_q4,
297                            const int16_t *filter_y, int y_step_q4, int w,
298                            int h) {
299   const InterpKernel *const filters_x = get_filter_base(filter_x);
300   const int x0_q4 = get_filter_offset(filter_x, filters_x);
301 
302   (void)filter_y;
303   (void)y_step_q4;
304 
305   convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
306                  w, h);
307 }
308 
aom_convolve8_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)309 void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
310                                  uint8_t *dst, ptrdiff_t dst_stride,
311                                  const int16_t *filter_x, int subpel_x,
312                                  int x_step_qn, const int16_t *filter_y,
313                                  int subpel_y, int y_step_qn, int w, int h) {
314   const InterpKernel *const filters_x = get_filter_base(filter_x);
315 
316   (void)subpel_y;
317   (void)filter_y;
318   (void)y_step_qn;
319 
320   convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
321                          x_step_qn, w, h);
322 }
323 
aom_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)324 void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
325                                uint8_t *dst, ptrdiff_t dst_stride,
326                                const int16_t *filter_x, int x_step_q4,
327                                const int16_t *filter_y, int y_step_q4, int w,
328                                int h) {
329   const InterpKernel *const filters_x = get_filter_base(filter_x);
330   const int x0_q4 = get_filter_offset(filter_x, filters_x);
331 
332   (void)filter_y;
333   (void)y_step_q4;
334 
335   convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
336                      x_step_q4, w, h);
337 }
338 
aom_convolve8_avg_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)339 void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
340                                      uint8_t *dst, ptrdiff_t dst_stride,
341                                      const int16_t *filter_x, int subpel_x,
342                                      int x_step_qn, const int16_t *filter_y,
343                                      int subpel_y, int y_step_qn, int w,
344                                      int h) {
345   const InterpKernel *const filters_x = get_filter_base(filter_x);
346 
347   (void)subpel_y;
348   (void)filter_y;
349   (void)y_step_qn;
350 
351   convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
352                              subpel_x, x_step_qn, w, h);
353 }
354 
aom_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)355 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
356                           uint8_t *dst, ptrdiff_t dst_stride,
357                           const int16_t *filter_x, int x_step_q4,
358                           const int16_t *filter_y, int y_step_q4, int w,
359                           int h) {
360   const InterpKernel *const filters_y = get_filter_base(filter_y);
361   const int y0_q4 = get_filter_offset(filter_y, filters_y);
362 
363   (void)filter_x;
364   (void)x_step_q4;
365 
366   convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
367                 w, h);
368 }
369 
aom_convolve8_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)370 void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
371                                 uint8_t *dst, ptrdiff_t dst_stride,
372                                 const int16_t *filter_x, int subpel_x,
373                                 int x_step_qn, const int16_t *filter_y,
374                                 int subpel_y, int y_step_qn, int w, int h) {
375   const InterpKernel *const filters_y = get_filter_base(filter_y);
376 
377   (void)subpel_x;
378   (void)filter_x;
379   (void)x_step_qn;
380 
381   convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
382                         y_step_qn, w, h);
383 }
384 
aom_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)385 void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
386                               uint8_t *dst, ptrdiff_t dst_stride,
387                               const int16_t *filter_x, int x_step_q4,
388                               const int16_t *filter_y, int y_step_q4, int w,
389                               int h) {
390   const InterpKernel *const filters_y = get_filter_base(filter_y);
391   const int y0_q4 = get_filter_offset(filter_y, filters_y);
392 
393   (void)filter_x;
394   (void)x_step_q4;
395 
396   convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
397                     y_step_q4, w, h);
398 }
399 
aom_convolve8_avg_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)400 void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
401                                     uint8_t *dst, ptrdiff_t dst_stride,
402                                     const int16_t *filter_x, int subpel_x,
403                                     int x_step_qn, const int16_t *filter_y,
404                                     int subpel_y, int y_step_qn, int w, int h) {
405   const InterpKernel *const filters_y = get_filter_base(filter_y);
406 
407   (void)subpel_x;
408   (void)filter_x;
409   (void)x_step_qn;
410 
411   convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
412                             subpel_y, y_step_qn, w, h);
413 }
414 
aom_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)415 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
416                      ptrdiff_t dst_stride, const int16_t *filter_x,
417                      int x_step_q4, const int16_t *filter_y, int y_step_q4,
418                      int w, int h) {
419   const InterpKernel *const filters_x = get_filter_base(filter_x);
420   const int x0_q4 = get_filter_offset(filter_x, filters_x);
421 
422   const InterpKernel *const filters_y = get_filter_base(filter_y);
423   const int y0_q4 = get_filter_offset(filter_y, filters_y);
424 
425   convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
426            filters_y, y0_q4, y_step_q4, w, h);
427 }
428 
aom_convolve8_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)429 void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
430                            uint8_t *dst, ptrdiff_t dst_stride,
431                            const int16_t *filter_x, int subpel_x, int x_step_qn,
432                            const int16_t *filter_y, int subpel_y, int y_step_qn,
433                            int w, int h) {
434   const InterpKernel *const filters_x = get_filter_base(filter_x);
435 
436   const InterpKernel *const filters_y = get_filter_base(filter_y);
437 
438   convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
439                    x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
440 }
441 
aom_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)442 void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
443                          ptrdiff_t dst_stride, const int16_t *filter_x,
444                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
445                          int w, int h) {
446   /* Fixed size intermediate buffer places limits on parameters. */
447   DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
448   assert(w <= MAX_SB_SIZE);
449   assert(h <= MAX_SB_SIZE);
450 
451   aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
452                   filter_y, y_step_q4, w, h);
453   aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
454                      h);
455 }
456 
aom_convolve8_avg_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)457 void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
458                                uint8_t *dst, ptrdiff_t dst_stride,
459                                const int16_t *filter_x, int subpel_x,
460                                int x_step_qn, const int16_t *filter_y,
461                                int subpel_y, int y_step_qn, int w, int h) {
462   /* Fixed size intermediate buffer places limits on parameters. */
463   DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
464   assert(w <= MAX_SB_SIZE);
465   assert(h <= MAX_SB_SIZE);
466 
467   aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
468                         x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
469   aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
470                      h);
471 }
472 
// Copies a w x h block of bytes from src to dst, one row at a time. The
// filter arguments exist only to match the common convolve signature and are
// ignored.
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int filter_x_stride, const int16_t *filter_y,
                         int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, w);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
490 
// Combines a w x h block of src into dst: each dst pixel becomes
// ROUND_POWER_OF_TWO(dst + src, 1). The filter arguments exist only to match
// the common convolve signature and are ignored.
void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int filter_x_stride, const int16_t *filter_y,
                        int filter_y_stride, int w, int h) {
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;

  const uint8_t *src_row = src;
  uint8_t *dst_row = dst;
  for (int row = 0; row < h; ++row) {
    for (int col = 0; col < w; ++col) {
      dst_row[col] = ROUND_POWER_OF_TWO(dst_row[col] + src_row[col], 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
509 
// Scaled horizontal filtering; forwards every argument unchanged to
// aom_convolve8_horiz_c.
void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
}
517 
// Scaled vertical filtering; forwards every argument unchanged to
// aom_convolve8_vert_c.
void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const int16_t *filter_x,
                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
                       int w, int h) {
  aom_convolve8_vert_c(src, src_stride, dst, dst_stride,
                       filter_x, x_step_q4,
                       filter_y, y_step_q4,
                       w, h);
}
525 
// Scaled 2-D filtering; forwards every argument unchanged to
// aom_convolve8_c.
void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const int16_t *filter_x,
                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
                     int w, int h) {
  aom_convolve8_c(src, src_stride, dst, dst_stride,
                  filter_x, x_step_q4,
                  filter_y, y_step_q4,
                  w, h);
}
533 
// Scaled horizontal filtering combined with the existing dst contents;
// forwards every argument unchanged to aom_convolve8_avg_horiz_c.
void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h) {
  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
}
542 
// Scaled vertical filtering combined with the existing dst contents;
// forwards every argument unchanged to aom_convolve8_avg_vert_c.
void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
                           filter_x, x_step_q4,
                           filter_y, y_step_q4,
                           w, h);
}
551 
// Scaled 2-D filtering combined with the existing dst contents; forwards
// every argument unchanged to aom_convolve8_avg_c.
void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const int16_t *filter_x,
                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  aom_convolve8_avg_c(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
}
559 
560 // TODO(afergs): Make sure this works too
561 #if CONFIG_LOOP_RESTORATION
convolve_add_src_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)562 static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
563                                    uint8_t *dst, ptrdiff_t dst_stride,
564                                    const InterpKernel *x_filters, int x0_q4,
565                                    int x_step_q4, int w, int h) {
566   int x, y, k;
567   src -= SUBPEL_TAPS / 2 - 1;
568   for (y = 0; y < h; ++y) {
569     int x_q4 = x0_q4;
570     for (x = 0; x < w; ++x) {
571       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
572       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
573       int sum = 0;
574       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
575       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
576                           src_x[SUBPEL_TAPS / 2 - 1]);
577       x_q4 += x_step_q4;
578     }
579     src += src_stride;
580     dst += dst_stride;
581   }
582 }
583 
convolve_add_src_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)584 static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
585                                   uint8_t *dst, ptrdiff_t dst_stride,
586                                   const InterpKernel *y_filters, int y0_q4,
587                                   int y_step_q4, int w, int h) {
588   int x, y, k;
589   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
590 
591   for (x = 0; x < w; ++x) {
592     int y_q4 = y0_q4;
593     for (y = 0; y < h; ++y) {
594       const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
595       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
596       int sum = 0;
597       for (k = 0; k < SUBPEL_TAPS; ++k)
598         sum += src_y[k * src_stride] * y_filter[k];
599       dst[y * dst_stride] =
600           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
601                      src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
602       y_q4 += y_step_q4;
603     }
604     ++src;
605     ++dst;
606   }
607 }
608 
convolve_add_src(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)609 static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
610                              uint8_t *dst, ptrdiff_t dst_stride,
611                              const InterpKernel *const x_filters, int x0_q4,
612                              int x_step_q4, const InterpKernel *const y_filters,
613                              int y0_q4, int y_step_q4, int w, int h) {
614   uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
615   int intermediate_height =
616       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
617 
618   assert(w <= MAX_SB_SIZE);
619   assert(h <= MAX_SB_SIZE);
620 
621   assert(y_step_q4 <= 32);
622   assert(x_step_q4 <= 32);
623 
624   convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
625                          temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
626                          intermediate_height);
627   convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
628                         dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
629 }
630 
aom_convolve8_add_src_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)631 void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
632                                    uint8_t *dst, ptrdiff_t dst_stride,
633                                    const int16_t *filter_x, int x_step_q4,
634                                    const int16_t *filter_y, int y_step_q4,
635                                    int w, int h) {
636   const InterpKernel *const filters_x = get_filter_base(filter_x);
637   const int x0_q4 = get_filter_offset(filter_x, filters_x);
638 
639   (void)filter_y;
640   (void)y_step_q4;
641 
642   convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
643                          x_step_q4, w, h);
644 }
645 
aom_convolve8_add_src_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)646 void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
647                                   uint8_t *dst, ptrdiff_t dst_stride,
648                                   const int16_t *filter_x, int x_step_q4,
649                                   const int16_t *filter_y, int y_step_q4, int w,
650                                   int h) {
651   const InterpKernel *const filters_y = get_filter_base(filter_y);
652   const int y0_q4 = get_filter_offset(filter_y, filters_y);
653 
654   (void)filter_x;
655   (void)x_step_q4;
656 
657   convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
658                         y_step_q4, w, h);
659 }
660 
aom_convolve8_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)661 void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
662                              uint8_t *dst, ptrdiff_t dst_stride,
663                              const int16_t *filter_x, int x_step_q4,
664                              const int16_t *filter_y, int y_step_q4, int w,
665                              int h) {
666   const InterpKernel *const filters_x = get_filter_base(filter_x);
667   const int x0_q4 = get_filter_offset(filter_x, filters_x);
668 
669   const InterpKernel *const filters_y = get_filter_base(filter_y);
670   const int y0_q4 = get_filter_offset(filter_y, filters_y);
671 
672   convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
673                    x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
674 }
675 
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)676 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
677                                        uint16_t *dst, ptrdiff_t dst_stride,
678                                        const InterpKernel *x_filters, int x0_q4,
679                                        int x_step_q4, int w, int h) {
680   const int bd = 8;
681   int x, y, k;
682   src -= SUBPEL_TAPS / 2 - 1;
683   for (y = 0; y < h; ++y) {
684     int x_q4 = x0_q4;
685     for (x = 0; x < w; ++x) {
686       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
687       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
688       int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
689                 (1 << (bd + FILTER_BITS - 1));
690       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
691       dst[x] =
692           (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
693                           0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
694       x_q4 += x_step_q4;
695     }
696     src += src_stride;
697     dst += dst_stride;
698   }
699 }
700 
convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)701 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
702                                       uint8_t *dst, ptrdiff_t dst_stride,
703                                       const InterpKernel *y_filters, int y0_q4,
704                                       int y_step_q4, int w, int h) {
705   const int bd = 8;
706   int x, y, k;
707   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
708 
709   for (x = 0; x < w; ++x) {
710     int y_q4 = y0_q4;
711     for (y = 0; y < h; ++y) {
712       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
713       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
714       int sum =
715           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
716           (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
717       for (k = 0; k < SUBPEL_TAPS; ++k)
718         sum += src_y[k * src_stride] * y_filter[k];
719       dst[y * dst_stride] =
720           clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
721       y_q4 += y_step_q4;
722     }
723     ++src;
724     ++dst;
725   }
726 }
727 
convolve_add_src_hip(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)728 static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
729                                  uint8_t *dst, ptrdiff_t dst_stride,
730                                  const InterpKernel *const x_filters, int x0_q4,
731                                  int x_step_q4,
732                                  const InterpKernel *const y_filters, int y0_q4,
733                                  int y_step_q4, int w, int h) {
734   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
735   int intermediate_height =
736       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
737 
738   assert(w <= MAX_SB_SIZE);
739   assert(h <= MAX_SB_SIZE);
740 
741   assert(y_step_q4 <= 32);
742   assert(x_step_q4 <= 32);
743 
744   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
745                              src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
746                              x_step_q4, w, intermediate_height);
747   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
748                             MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
749                             y_step_q4, w, h);
750 }
751 
aom_convolve8_add_src_horiz_hip_c(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)752 void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
753                                        uint16_t *dst, ptrdiff_t dst_stride,
754                                        const int16_t *filter_x, int x_step_q4,
755                                        const int16_t *filter_y, int y_step_q4,
756                                        int w, int h) {
757   const InterpKernel *const filters_x = get_filter_base(filter_x);
758   const int x0_q4 = get_filter_offset(filter_x, filters_x);
759 
760   (void)filter_y;
761   (void)y_step_q4;
762 
763   convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
764                              x_step_q4, w, h);
765 }
766 
aom_convolve8_add_src_vert_hip_c(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)767 void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
768                                       uint8_t *dst, ptrdiff_t dst_stride,
769                                       const int16_t *filter_x, int x_step_q4,
770                                       const int16_t *filter_y, int y_step_q4,
771                                       int w, int h) {
772   const InterpKernel *const filters_y = get_filter_base(filter_y);
773   const int y0_q4 = get_filter_offset(filter_y, filters_y);
774 
775   (void)filter_x;
776   (void)x_step_q4;
777 
778   convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
779                             y_step_q4, w, h);
780 }
781 
aom_convolve8_add_src_hip_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)782 void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
783                                  uint8_t *dst, ptrdiff_t dst_stride,
784                                  const int16_t *filter_x, int x_step_q4,
785                                  const int16_t *filter_y, int y_step_q4, int w,
786                                  int h) {
787   const InterpKernel *const filters_x = get_filter_base(filter_x);
788   const int x0_q4 = get_filter_offset(filter_x, filters_x);
789 
790   const InterpKernel *const filters_y = get_filter_base(filter_y);
791   const int y0_q4 = get_filter_offset(filter_y, filters_y);
792 
793   convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
794                        x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
795 }
796 #endif  // CONFIG_LOOP_RESTORATION
797 
798 // TODO(afergs): Make sure this works too
799 #if CONFIG_HIGHBITDEPTH
highbd_convolve_horiz(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)800 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
801                                   uint8_t *dst8, ptrdiff_t dst_stride,
802                                   const InterpKernel *x_filters, int x0_q4,
803                                   int x_step_q4, int w, int h, int bd) {
804   int x, y;
805   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
806   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
807   src -= SUBPEL_TAPS / 2 - 1;
808   for (y = 0; y < h; ++y) {
809     int x_q4 = x0_q4;
810     for (x = 0; x < w; ++x) {
811       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
812       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
813       int k, sum = 0;
814       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
815       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
816       x_q4 += x_step_q4;
817     }
818     src += src_stride;
819     dst += dst_stride;
820   }
821 }
822 
highbd_convolve_avg_horiz(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)823 static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
824                                       uint8_t *dst8, ptrdiff_t dst_stride,
825                                       const InterpKernel *x_filters, int x0_q4,
826                                       int x_step_q4, int w, int h, int bd) {
827   int x, y;
828   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
829   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
830   src -= SUBPEL_TAPS / 2 - 1;
831   for (y = 0; y < h; ++y) {
832     int x_q4 = x0_q4;
833     for (x = 0; x < w; ++x) {
834       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
835       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
836       int k, sum = 0;
837       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
838       dst[x] = ROUND_POWER_OF_TWO(
839           dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
840           1);
841       x_q4 += x_step_q4;
842     }
843     src += src_stride;
844     dst += dst_stride;
845   }
846 }
847 
highbd_convolve_vert(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)848 static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
849                                  uint8_t *dst8, ptrdiff_t dst_stride,
850                                  const InterpKernel *y_filters, int y0_q4,
851                                  int y_step_q4, int w, int h, int bd) {
852   int x, y;
853   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
854   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
855   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
856   for (x = 0; x < w; ++x) {
857     int y_q4 = y0_q4;
858     for (y = 0; y < h; ++y) {
859       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
860       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
861       int k, sum = 0;
862       for (k = 0; k < SUBPEL_TAPS; ++k)
863         sum += src_y[k * src_stride] * y_filter[k];
864       dst[y * dst_stride] =
865           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
866       y_q4 += y_step_q4;
867     }
868     ++src;
869     ++dst;
870   }
871 }
872 
highbd_convolve_avg_vert(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)873 static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
874                                      uint8_t *dst8, ptrdiff_t dst_stride,
875                                      const InterpKernel *y_filters, int y0_q4,
876                                      int y_step_q4, int w, int h, int bd) {
877   int x, y;
878   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
879   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
880   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
881   for (x = 0; x < w; ++x) {
882     int y_q4 = y0_q4;
883     for (y = 0; y < h; ++y) {
884       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
885       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
886       int k, sum = 0;
887       for (k = 0; k < SUBPEL_TAPS; ++k)
888         sum += src_y[k * src_stride] * y_filter[k];
889       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
890           dst[y * dst_stride] +
891               clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
892           1);
893       y_q4 += y_step_q4;
894     }
895     ++src;
896     ++dst;
897   }
898 }
899 
highbd_convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)900 static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
901                             uint8_t *dst, ptrdiff_t dst_stride,
902                             const InterpKernel *const x_filters, int x0_q4,
903                             int x_step_q4, const InterpKernel *const y_filters,
904                             int y0_q4, int y_step_q4, int w, int h, int bd) {
905   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
906   // 2d filtering proceeds in 2 steps:
907   //   (1) Interpolate horizontally into an intermediate buffer, temp.
908   //   (2) Interpolate temp vertically to derive the sub-pixel result.
909   // Deriving the maximum number of rows in the temp buffer (135):
910   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
911   // --Largest block size is 64x64 pixels.
912   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
913   //   original frame (in 1/16th pixel units).
914   // --Must round-up because block may be located at sub-pixel position.
915   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
916   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
917   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
918   int intermediate_height =
919       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
920 
921   assert(w <= MAX_SB_SIZE);
922   assert(h <= MAX_SB_SIZE);
923   assert(y_step_q4 <= 32);
924   assert(x_step_q4 <= 32);
925 
926   highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
927                         CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
928                         x_step_q4, w, intermediate_height, bd);
929   highbd_convolve_vert(
930       CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
931       MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
932 }
933 
aom_highbd_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)934 void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
935                                   uint8_t *dst, ptrdiff_t dst_stride,
936                                   const int16_t *filter_x, int x_step_q4,
937                                   const int16_t *filter_y, int y_step_q4, int w,
938                                   int h, int bd) {
939   const InterpKernel *const filters_x = get_filter_base(filter_x);
940   const int x0_q4 = get_filter_offset(filter_x, filters_x);
941   (void)filter_y;
942   (void)y_step_q4;
943 
944   highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
945                         x_step_q4, w, h, bd);
946 }
947 
aom_highbd_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)948 void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
949                                       uint8_t *dst, ptrdiff_t dst_stride,
950                                       const int16_t *filter_x, int x_step_q4,
951                                       const int16_t *filter_y, int y_step_q4,
952                                       int w, int h, int bd) {
953   const InterpKernel *const filters_x = get_filter_base(filter_x);
954   const int x0_q4 = get_filter_offset(filter_x, filters_x);
955   (void)filter_y;
956   (void)y_step_q4;
957 
958   highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
959                             x_step_q4, w, h, bd);
960 }
961 
aom_highbd_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)962 void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
963                                  uint8_t *dst, ptrdiff_t dst_stride,
964                                  const int16_t *filter_x, int x_step_q4,
965                                  const int16_t *filter_y, int y_step_q4, int w,
966                                  int h, int bd) {
967   const InterpKernel *const filters_y = get_filter_base(filter_y);
968   const int y0_q4 = get_filter_offset(filter_y, filters_y);
969   (void)filter_x;
970   (void)x_step_q4;
971 
972   highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
973                        y_step_q4, w, h, bd);
974 }
975 
aom_highbd_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)976 void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
977                                      uint8_t *dst, ptrdiff_t dst_stride,
978                                      const int16_t *filter_x, int x_step_q4,
979                                      const int16_t *filter_y, int y_step_q4,
980                                      int w, int h, int bd) {
981   const InterpKernel *const filters_y = get_filter_base(filter_y);
982   const int y0_q4 = get_filter_offset(filter_y, filters_y);
983   (void)filter_x;
984   (void)x_step_q4;
985 
986   highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
987                            y_step_q4, w, h, bd);
988 }
989 
aom_highbd_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)990 void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
991                             uint8_t *dst, ptrdiff_t dst_stride,
992                             const int16_t *filter_x, int x_step_q4,
993                             const int16_t *filter_y, int y_step_q4, int w,
994                             int h, int bd) {
995   const InterpKernel *const filters_x = get_filter_base(filter_x);
996   const int x0_q4 = get_filter_offset(filter_x, filters_x);
997 
998   const InterpKernel *const filters_y = get_filter_base(filter_y);
999   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1000 
1001   highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
1002                   filters_y, y0_q4, y_step_q4, w, h, bd);
1003 }
1004 
aom_highbd_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1005 void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
1006                                 uint8_t *dst, ptrdiff_t dst_stride,
1007                                 const int16_t *filter_x, int x_step_q4,
1008                                 const int16_t *filter_y, int y_step_q4, int w,
1009                                 int h, int bd) {
1010   // Fixed size intermediate buffer places limits on parameters.
1011   DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
1012   assert(w <= MAX_SB_SIZE);
1013   assert(h <= MAX_SB_SIZE);
1014 
1015   aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
1016                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
1017   aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, dst,
1018                             dst_stride, NULL, 0, NULL, 0, w, h, bd);
1019 }
1020 
// Copies a w x h block of 16-bit samples from src8 to dst8, one row at a
// time with memcpy(). Both pointers are converted with
// CONVERT_TO_SHORTPTR(); the filter arguments and bd are ignored.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  const size_t row_bytes = w * sizeof(uint16_t);
  int row;

  // Unused: present only as part of the function signature.
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row) {
    memcpy(dst_row, src_row, row_bytes);
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
1041 
// Averages a w x h block of 16-bit samples from src8 into dst8: each
// destination sample becomes ROUND_POWER_OF_TWO(dst + src, 1). Both pointers
// are converted with CONVERT_TO_SHORTPTR(); the filter arguments and bd are
// ignored.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  const uint16_t *src_row = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst_row = CONVERT_TO_SHORTPTR(dst8);
  int row;

  // Unused: present only as part of the function signature.
  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row) {
    int col;
    for (col = 0; col < w; ++col) {
      const int pair_sum = dst_row[col] + src_row[col];
      dst_row[col] = ROUND_POWER_OF_TWO(pair_sum, 1);
    }
    src_row += src_stride;
    dst_row += dst_stride;
  }
}
1064 
1065 #if CONFIG_LOOP_RESTORATION
highbd_convolve_add_src_horiz(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)1066 static void highbd_convolve_add_src_horiz(const uint8_t *src8,
1067                                           ptrdiff_t src_stride, uint8_t *dst8,
1068                                           ptrdiff_t dst_stride,
1069                                           const InterpKernel *x_filters,
1070                                           int x0_q4, int x_step_q4, int w,
1071                                           int h, int bd) {
1072   int x, y, k;
1073   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1074   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1075   src -= SUBPEL_TAPS / 2 - 1;
1076   for (y = 0; y < h; ++y) {
1077     int x_q4 = x0_q4;
1078     for (x = 0; x < w; ++x) {
1079       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1080       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1081       int sum = 0;
1082       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
1083       dst[x] = clip_pixel_highbd(
1084           ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
1085           bd);
1086       x_q4 += x_step_q4;
1087     }
1088     src += src_stride;
1089     dst += dst_stride;
1090   }
1091 }
1092 
highbd_convolve_add_src_vert(const uint8_t * src8,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)1093 static void highbd_convolve_add_src_vert(const uint8_t *src8,
1094                                          ptrdiff_t src_stride, uint8_t *dst8,
1095                                          ptrdiff_t dst_stride,
1096                                          const InterpKernel *y_filters,
1097                                          int y0_q4, int y_step_q4, int w, int h,
1098                                          int bd) {
1099   int x, y, k;
1100   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1101   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1102   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1103   for (x = 0; x < w; ++x) {
1104     int y_q4 = y0_q4;
1105     for (y = 0; y < h; ++y) {
1106       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1107       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1108       int sum = 0;
1109       for (k = 0; k < SUBPEL_TAPS; ++k)
1110         sum += src_y[k * src_stride] * y_filter[k];
1111       dst[y * dst_stride] =
1112           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
1113                                 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
1114                             bd);
1115       y_q4 += y_step_q4;
1116     }
1117     ++src;
1118     ++dst;
1119   }
1120 }
1121 
highbd_convolve_add_src(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)1122 static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
1123                                     uint8_t *dst, ptrdiff_t dst_stride,
1124                                     const InterpKernel *const x_filters,
1125                                     int x0_q4, int x_step_q4,
1126                                     const InterpKernel *const y_filters,
1127                                     int y0_q4, int y_step_q4, int w, int h,
1128                                     int bd) {
1129   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1130   // 2d filtering proceeds in 2 steps:
1131   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1132   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1133   // Deriving the maximum number of rows in the temp buffer (135):
1134   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1135   // --Largest block size is 64x64 pixels.
1136   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1137   //   original frame (in 1/16th pixel units).
1138   // --Must round-up because block may be located at sub-pixel position.
1139   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1140   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1141   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
1142   int intermediate_height =
1143       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1144 
1145   assert(w <= MAX_SB_SIZE);
1146   assert(h <= MAX_SB_SIZE);
1147   assert(y_step_q4 <= 32);
1148   assert(x_step_q4 <= 32);
1149 
1150   highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
1151                                 src_stride, CONVERT_TO_BYTEPTR(temp),
1152                                 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
1153                                 intermediate_height, bd);
1154   highbd_convolve_add_src_vert(
1155       CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1156       MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
1157 }
1158 
aom_highbd_convolve8_add_src_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1159 void aom_highbd_convolve8_add_src_horiz_c(
1160     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1161     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1162     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1163   const InterpKernel *const filters_x = get_filter_base(filter_x);
1164   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1165   (void)filter_y;
1166   (void)y_step_q4;
1167 
1168   highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
1169                                 x0_q4, x_step_q4, w, h, bd);
1170 }
1171 
aom_highbd_convolve8_add_src_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1172 void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
1173                                          ptrdiff_t src_stride, uint8_t *dst,
1174                                          ptrdiff_t dst_stride,
1175                                          const int16_t *filter_x, int x_step_q4,
1176                                          const int16_t *filter_y, int y_step_q4,
1177                                          int w, int h, int bd) {
1178   const InterpKernel *const filters_y = get_filter_base(filter_y);
1179   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1180   (void)filter_x;
1181   (void)x_step_q4;
1182 
1183   highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
1184                                y0_q4, y_step_q4, w, h, bd);
1185 }
1186 
aom_highbd_convolve8_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1187 void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
1188                                     uint8_t *dst, ptrdiff_t dst_stride,
1189                                     const int16_t *filter_x, int x_step_q4,
1190                                     const int16_t *filter_y, int y_step_q4,
1191                                     int w, int h, int bd) {
1192   const InterpKernel *const filters_x = get_filter_base(filter_x);
1193   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1194 
1195   const InterpKernel *const filters_y = get_filter_base(filter_y);
1196   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1197 
1198   highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
1199                           x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
1200 }
1201 
highbd_convolve_add_src_horiz_hip(const uint8_t * src8,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h,int bd)1202 static void highbd_convolve_add_src_horiz_hip(
1203     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
1204     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
1205     int x_step_q4, int w, int h, int bd) {
1206   const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
1207   int x, y, k;
1208   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1209   src -= SUBPEL_TAPS / 2 - 1;
1210   for (y = 0; y < h; ++y) {
1211     int x_q4 = x0_q4;
1212     for (x = 0; x < w; ++x) {
1213       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
1214       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
1215       int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
1216                 (1 << (bd + FILTER_BITS - 1));
1217       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
1218       dst[x] =
1219           (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
1220                           0, extraprec_clamp_limit - 1);
1221       x_q4 += x_step_q4;
1222     }
1223     src += src_stride;
1224     dst += dst_stride;
1225   }
1226 }
1227 
highbd_convolve_add_src_vert_hip(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst8,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)1228 static void highbd_convolve_add_src_vert_hip(
1229     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
1230     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
1231     int y_step_q4, int w, int h, int bd) {
1232   int x, y, k;
1233   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
1234   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
1235   for (x = 0; x < w; ++x) {
1236     int y_q4 = y0_q4;
1237     for (y = 0; y < h; ++y) {
1238       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
1239       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
1240       int sum =
1241           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
1242           (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
1243       for (k = 0; k < SUBPEL_TAPS; ++k)
1244         sum += src_y[k * src_stride] * y_filter[k];
1245       dst[y * dst_stride] = clip_pixel_highbd(
1246           ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
1247       y_q4 += y_step_q4;
1248     }
1249     ++src;
1250     ++dst;
1251   }
1252 }
1253 
highbd_convolve_add_src_hip(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h,int bd)1254 static void highbd_convolve_add_src_hip(
1255     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1256     ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
1257     int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
1258     int y_step_q4, int w, int h, int bd) {
1259   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
1260   // 2d filtering proceeds in 2 steps:
1261   //   (1) Interpolate horizontally into an intermediate buffer, temp.
1262   //   (2) Interpolate temp vertically to derive the sub-pixel result.
1263   // Deriving the maximum number of rows in the temp buffer (135):
1264   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
1265   // --Largest block size is 64x64 pixels.
1266   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
1267   //   original frame (in 1/16th pixel units).
1268   // --Must round-up because block may be located at sub-pixel position.
1269   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
1270   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
1271   uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
1272   int intermediate_height =
1273       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
1274 
1275   assert(w <= MAX_SB_SIZE);
1276   assert(h <= MAX_SB_SIZE);
1277   assert(y_step_q4 <= 32);
1278   assert(x_step_q4 <= 32);
1279 
1280   highbd_convolve_add_src_horiz_hip(
1281       src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
1282       x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
1283   highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
1284                                    MAX_SB_SIZE, dst, dst_stride, y_filters,
1285                                    y0_q4, y_step_q4, w, h, bd);
1286 }
1287 
aom_highbd_convolve8_add_src_horiz_hip_c(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1288 void aom_highbd_convolve8_add_src_horiz_hip_c(
1289     const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
1290     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1291     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1292   const InterpKernel *const filters_x = get_filter_base(filter_x);
1293   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1294   (void)filter_y;
1295   (void)y_step_q4;
1296 
1297   highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x,
1298                                     x0_q4, x_step_q4, w, h, bd);
1299 }
1300 
aom_highbd_convolve8_add_src_vert_hip_c(const uint16_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1301 void aom_highbd_convolve8_add_src_vert_hip_c(
1302     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
1303     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
1304     const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
1305   const InterpKernel *const filters_y = get_filter_base(filter_y);
1306   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1307   (void)filter_x;
1308   (void)x_step_q4;
1309 
1310   highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y,
1311                                    y0_q4, y_step_q4, w, h, bd);
1312 }
1313 
aom_highbd_convolve8_add_src_hip_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h,int bd)1314 void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
1315                                         ptrdiff_t src_stride, uint8_t *dst,
1316                                         ptrdiff_t dst_stride,
1317                                         const int16_t *filter_x, int x_step_q4,
1318                                         const int16_t *filter_y, int y_step_q4,
1319                                         int w, int h, int bd) {
1320   const InterpKernel *const filters_x = get_filter_base(filter_x);
1321   const int x0_q4 = get_filter_offset(filter_x, filters_x);
1322 
1323   const InterpKernel *const filters_y = get_filter_base(filter_y);
1324   const int y0_q4 = get_filter_offset(filter_y, filters_y);
1325 
1326   highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
1327                               x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
1328                               h, bd);
1329 }
1330 
1331 #endif  // CONFIG_LOOP_RESTORATION
1332 #endif  // CONFIG_HIGHBITDEPTH
1333