1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <string.h>
14
15 #include "./aom_config.h"
16 #include "./aom_dsp_rtcd.h"
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/aom_convolve.h"
19 #include "aom_dsp/aom_dsp_common.h"
20 #include "aom_dsp/aom_filter.h"
21 #include "aom_ports/mem.h"
22
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)23 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
24 uint8_t *dst, ptrdiff_t dst_stride,
25 const InterpKernel *x_filters, int x0_q4,
26 int x_step_q4, int w, int h) {
27 int x, y;
28 src -= SUBPEL_TAPS / 2 - 1;
29 for (y = 0; y < h; ++y) {
30 int x_q4 = x0_q4;
31 for (x = 0; x < w; ++x) {
32 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
33 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
34 int k, sum = 0;
35 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
36 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
37 x_q4 += x_step_q4;
38 }
39 src += src_stride;
40 dst += dst_stride;
41 }
42 }
43
convolve_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_qn,int x_step_qn,int w,int h)44 static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
45 uint8_t *dst, ptrdiff_t dst_stride,
46 const InterpKernel *x_filters, int x0_qn,
47 int x_step_qn, int w, int h) {
48 int x, y;
49 src -= SUBPEL_TAPS / 2 - 1;
50 for (y = 0; y < h; ++y) {
51 int x_qn = x0_qn;
52 for (x = 0; x < w; ++x) {
53 const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; // q8
54 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
55 assert(x_filter_idx < SUBPEL_SHIFTS);
56 const int16_t *const x_filter = x_filters[x_filter_idx];
57 int k, sum = 0;
58 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
59 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
60 x_qn += x_step_qn;
61 }
62 src += src_stride;
63 dst += dst_stride;
64 }
65 }
66
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)67 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
68 uint8_t *dst, ptrdiff_t dst_stride,
69 const InterpKernel *x_filters, int x0_q4,
70 int x_step_q4, int w, int h) {
71 int x, y;
72 src -= SUBPEL_TAPS / 2 - 1;
73 for (y = 0; y < h; ++y) {
74 int x_q4 = x0_q4;
75 for (x = 0; x < w; ++x) {
76 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
77 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
78 int k, sum = 0;
79 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
80 dst[x] = ROUND_POWER_OF_TWO(
81 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
82 x_q4 += x_step_q4;
83 }
84 src += src_stride;
85 dst += dst_stride;
86 }
87 }
88
convolve_avg_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_qn,int x_step_qn,int w,int h)89 static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
90 uint8_t *dst, ptrdiff_t dst_stride,
91 const InterpKernel *x_filters, int x0_qn,
92 int x_step_qn, int w, int h) {
93 int x, y;
94 src -= SUBPEL_TAPS / 2 - 1;
95 for (y = 0; y < h; ++y) {
96 int x_qn = x0_qn;
97 for (x = 0; x < w; ++x) {
98 const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
99 const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
100 assert(x_filter_idx < SUBPEL_SHIFTS);
101 const int16_t *const x_filter = x_filters[x_filter_idx];
102 int k, sum = 0;
103 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
104 dst[x] = ROUND_POWER_OF_TWO(
105 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
106 x_qn += x_step_qn;
107 }
108 src += src_stride;
109 dst += dst_stride;
110 }
111 }
112
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)113 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
114 uint8_t *dst, ptrdiff_t dst_stride,
115 const InterpKernel *y_filters, int y0_q4,
116 int y_step_q4, int w, int h) {
117 int x, y;
118 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
119
120 for (x = 0; x < w; ++x) {
121 int y_q4 = y0_q4;
122 for (y = 0; y < h; ++y) {
123 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
124 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
125 int k, sum = 0;
126 for (k = 0; k < SUBPEL_TAPS; ++k)
127 sum += src_y[k * src_stride] * y_filter[k];
128 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
129 y_q4 += y_step_q4;
130 }
131 ++src;
132 ++dst;
133 }
134 }
135
convolve_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_qn,int y_step_qn,int w,int h)136 static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
137 uint8_t *dst, ptrdiff_t dst_stride,
138 const InterpKernel *y_filters, int y0_qn,
139 int y_step_qn, int w, int h) {
140 int x, y;
141 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
142
143 for (x = 0; x < w; ++x) {
144 int y_qn = y0_qn;
145 for (y = 0; y < h; ++y) {
146 const unsigned char *src_y =
147 &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
148 const int16_t *const y_filter =
149 y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
150 int k, sum = 0;
151 for (k = 0; k < SUBPEL_TAPS; ++k)
152 sum += src_y[k * src_stride] * y_filter[k];
153 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
154 y_qn += y_step_qn;
155 }
156 ++src;
157 ++dst;
158 }
159 }
160
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)161 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
162 uint8_t *dst, ptrdiff_t dst_stride,
163 const InterpKernel *y_filters, int y0_q4,
164 int y_step_q4, int w, int h) {
165 int x, y;
166 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
167
168 for (x = 0; x < w; ++x) {
169 int y_q4 = y0_q4;
170 for (y = 0; y < h; ++y) {
171 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
172 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
173 int k, sum = 0;
174 for (k = 0; k < SUBPEL_TAPS; ++k)
175 sum += src_y[k * src_stride] * y_filter[k];
176 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
177 dst[y * dst_stride] +
178 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
179 1);
180 y_q4 += y_step_q4;
181 }
182 ++src;
183 ++dst;
184 }
185 }
186
convolve_avg_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_qn,int y_step_qn,int w,int h)187 static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
188 uint8_t *dst, ptrdiff_t dst_stride,
189 const InterpKernel *y_filters, int y0_qn,
190 int y_step_qn, int w, int h) {
191 int x, y;
192 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
193
194 for (x = 0; x < w; ++x) {
195 int y_qn = y0_qn;
196 for (y = 0; y < h; ++y) {
197 const unsigned char *src_y =
198 &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
199 const int16_t *const y_filter =
200 y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
201 int k, sum = 0;
202 for (k = 0; k < SUBPEL_TAPS; ++k)
203 sum += src_y[k * src_stride] * y_filter[k];
204 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
205 dst[y * dst_stride] +
206 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
207 1);
208 y_qn += y_step_qn;
209 }
210 ++src;
211 ++dst;
212 }
213 }
214
convolve(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)215 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
216 ptrdiff_t dst_stride, const InterpKernel *const x_filters,
217 int x0_q4, int x_step_q4,
218 const InterpKernel *const y_filters, int y0_q4,
219 int y_step_q4, int w, int h) {
220 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
221 // 2d filtering proceeds in 2 steps:
222 // (1) Interpolate horizontally into an intermediate buffer, temp.
223 // (2) Interpolate temp vertically to derive the sub-pixel result.
224 // Deriving the maximum number of rows in the temp buffer (135):
225 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
226 // --Largest block size is 64x64 pixels.
227 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
228 // original frame (in 1/16th pixel units).
229 // --Must round-up because block may be located at sub-pixel position.
230 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
231 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
232 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
233 int intermediate_height =
234 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
235
236 assert(w <= MAX_SB_SIZE);
237 assert(h <= MAX_SB_SIZE);
238
239 assert(y_step_q4 <= 32);
240 assert(x_step_q4 <= 32);
241
242 convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
243 MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
244 intermediate_height);
245 convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst,
246 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
247 }
248
convolve_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_qn,int x_step_qn,const InterpKernel * const y_filters,int y0_qn,int y_step_qn,int w,int h)249 static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
250 uint8_t *dst, ptrdiff_t dst_stride,
251 const InterpKernel *const x_filters, int x0_qn,
252 int x_step_qn, const InterpKernel *const y_filters,
253 int y0_qn, int y_step_qn, int w, int h) {
254 // TODO(afergs): Update comment here
255 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
256 // 2d filtering proceeds in 2 steps:
257 // (1) Interpolate horizontally into an intermediate buffer, temp.
258 // (2) Interpolate temp vertically to derive the sub-pixel result.
259 // Deriving the maximum number of rows in the temp buffer (135):
260 // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
261 // --Largest block size is 64x64 pixels.
262 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
263 // original frame (in 1/16th pixel units).
264 // --Must round-up because block may be located at sub-pixel position.
265 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
266 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
267 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
268 int intermediate_height =
269 (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
270
271 assert(w <= MAX_SB_SIZE);
272 assert(h <= MAX_SB_SIZE);
273
274 assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
275 assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
276
277 convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
278 temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
279 intermediate_height);
280 convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
281 dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
282 }
283
get_filter_base(const int16_t * filter)284 static const InterpKernel *get_filter_base(const int16_t *filter) {
285 // NOTE: This assumes that the filter table is 256-byte aligned.
286 // TODO(agrange) Modify to make independent of table alignment.
287 return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
288 }
289
get_filter_offset(const int16_t * f,const InterpKernel * base)290 static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
291 return (int)((const InterpKernel *)(intptr_t)f - base);
292 }
293
aom_convolve8_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)294 void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
295 uint8_t *dst, ptrdiff_t dst_stride,
296 const int16_t *filter_x, int x_step_q4,
297 const int16_t *filter_y, int y_step_q4, int w,
298 int h) {
299 const InterpKernel *const filters_x = get_filter_base(filter_x);
300 const int x0_q4 = get_filter_offset(filter_x, filters_x);
301
302 (void)filter_y;
303 (void)y_step_q4;
304
305 convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
306 w, h);
307 }
308
aom_convolve8_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)309 void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
310 uint8_t *dst, ptrdiff_t dst_stride,
311 const int16_t *filter_x, int subpel_x,
312 int x_step_qn, const int16_t *filter_y,
313 int subpel_y, int y_step_qn, int w, int h) {
314 const InterpKernel *const filters_x = get_filter_base(filter_x);
315
316 (void)subpel_y;
317 (void)filter_y;
318 (void)y_step_qn;
319
320 convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
321 x_step_qn, w, h);
322 }
323
aom_convolve8_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)324 void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
325 uint8_t *dst, ptrdiff_t dst_stride,
326 const int16_t *filter_x, int x_step_q4,
327 const int16_t *filter_y, int y_step_q4, int w,
328 int h) {
329 const InterpKernel *const filters_x = get_filter_base(filter_x);
330 const int x0_q4 = get_filter_offset(filter_x, filters_x);
331
332 (void)filter_y;
333 (void)y_step_q4;
334
335 convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
336 x_step_q4, w, h);
337 }
338
aom_convolve8_avg_horiz_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)339 void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
340 uint8_t *dst, ptrdiff_t dst_stride,
341 const int16_t *filter_x, int subpel_x,
342 int x_step_qn, const int16_t *filter_y,
343 int subpel_y, int y_step_qn, int w,
344 int h) {
345 const InterpKernel *const filters_x = get_filter_base(filter_x);
346
347 (void)subpel_y;
348 (void)filter_y;
349 (void)y_step_qn;
350
351 convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
352 subpel_x, x_step_qn, w, h);
353 }
354
aom_convolve8_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)355 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
356 uint8_t *dst, ptrdiff_t dst_stride,
357 const int16_t *filter_x, int x_step_q4,
358 const int16_t *filter_y, int y_step_q4, int w,
359 int h) {
360 const InterpKernel *const filters_y = get_filter_base(filter_y);
361 const int y0_q4 = get_filter_offset(filter_y, filters_y);
362
363 (void)filter_x;
364 (void)x_step_q4;
365
366 convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
367 w, h);
368 }
369
aom_convolve8_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)370 void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
371 uint8_t *dst, ptrdiff_t dst_stride,
372 const int16_t *filter_x, int subpel_x,
373 int x_step_qn, const int16_t *filter_y,
374 int subpel_y, int y_step_qn, int w, int h) {
375 const InterpKernel *const filters_y = get_filter_base(filter_y);
376
377 (void)subpel_x;
378 (void)filter_x;
379 (void)x_step_qn;
380
381 convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
382 y_step_qn, w, h);
383 }
384
aom_convolve8_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)385 void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
386 uint8_t *dst, ptrdiff_t dst_stride,
387 const int16_t *filter_x, int x_step_q4,
388 const int16_t *filter_y, int y_step_q4, int w,
389 int h) {
390 const InterpKernel *const filters_y = get_filter_base(filter_y);
391 const int y0_q4 = get_filter_offset(filter_y, filters_y);
392
393 (void)filter_x;
394 (void)x_step_q4;
395
396 convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
397 y_step_q4, w, h);
398 }
399
aom_convolve8_avg_vert_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)400 void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
401 uint8_t *dst, ptrdiff_t dst_stride,
402 const int16_t *filter_x, int subpel_x,
403 int x_step_qn, const int16_t *filter_y,
404 int subpel_y, int y_step_qn, int w, int h) {
405 const InterpKernel *const filters_y = get_filter_base(filter_y);
406
407 (void)subpel_x;
408 (void)filter_x;
409 (void)x_step_qn;
410
411 convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
412 subpel_y, y_step_qn, w, h);
413 }
414
aom_convolve8_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)415 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
416 ptrdiff_t dst_stride, const int16_t *filter_x,
417 int x_step_q4, const int16_t *filter_y, int y_step_q4,
418 int w, int h) {
419 const InterpKernel *const filters_x = get_filter_base(filter_x);
420 const int x0_q4 = get_filter_offset(filter_x, filters_x);
421
422 const InterpKernel *const filters_y = get_filter_base(filter_y);
423 const int y0_q4 = get_filter_offset(filter_y, filters_y);
424
425 convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
426 filters_y, y0_q4, y_step_q4, w, h);
427 }
428
aom_convolve8_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)429 void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
430 uint8_t *dst, ptrdiff_t dst_stride,
431 const int16_t *filter_x, int subpel_x, int x_step_qn,
432 const int16_t *filter_y, int subpel_y, int y_step_qn,
433 int w, int h) {
434 const InterpKernel *const filters_x = get_filter_base(filter_x);
435
436 const InterpKernel *const filters_y = get_filter_base(filter_y);
437
438 convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
439 x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
440 }
441
aom_convolve8_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)442 void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
443 ptrdiff_t dst_stride, const int16_t *filter_x,
444 int x_step_q4, const int16_t *filter_y, int y_step_q4,
445 int w, int h) {
446 /* Fixed size intermediate buffer places limits on parameters. */
447 DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
448 assert(w <= MAX_SB_SIZE);
449 assert(h <= MAX_SB_SIZE);
450
451 aom_convolve8_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, x_step_q4,
452 filter_y, y_step_q4, w, h);
453 aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
454 h);
455 }
456
aom_convolve8_avg_scale_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int subpel_x,int x_step_qn,const int16_t * filter_y,int subpel_y,int y_step_qn,int w,int h)457 void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
458 uint8_t *dst, ptrdiff_t dst_stride,
459 const int16_t *filter_x, int subpel_x,
460 int x_step_qn, const int16_t *filter_y,
461 int subpel_y, int y_step_qn, int w, int h) {
462 /* Fixed size intermediate buffer places limits on parameters. */
463 DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
464 assert(w <= MAX_SB_SIZE);
465 assert(h <= MAX_SB_SIZE);
466
467 aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
468 x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
469 aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
470 h);
471 }
472
aom_convolve_copy_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int filter_x_stride,const int16_t * filter_y,int filter_y_stride,int w,int h)473 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
474 ptrdiff_t dst_stride, const int16_t *filter_x,
475 int filter_x_stride, const int16_t *filter_y,
476 int filter_y_stride, int w, int h) {
477 int r;
478
479 (void)filter_x;
480 (void)filter_x_stride;
481 (void)filter_y;
482 (void)filter_y_stride;
483
484 for (r = h; r > 0; --r) {
485 memcpy(dst, src, w);
486 src += src_stride;
487 dst += dst_stride;
488 }
489 }
490
aom_convolve_avg_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int filter_x_stride,const int16_t * filter_y,int filter_y_stride,int w,int h)491 void aom_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
492 ptrdiff_t dst_stride, const int16_t *filter_x,
493 int filter_x_stride, const int16_t *filter_y,
494 int filter_y_stride, int w, int h) {
495 int x, y;
496
497 (void)filter_x;
498 (void)filter_x_stride;
499 (void)filter_y;
500 (void)filter_y_stride;
501
502 for (y = 0; y < h; ++y) {
503 for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
504
505 src += src_stride;
506 dst += dst_stride;
507 }
508 }
509
aom_scaled_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)510 void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
511 ptrdiff_t dst_stride, const int16_t *filter_x,
512 int x_step_q4, const int16_t *filter_y, int y_step_q4,
513 int w, int h) {
514 aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
515 filter_y, y_step_q4, w, h);
516 }
517
aom_scaled_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)518 void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
519 ptrdiff_t dst_stride, const int16_t *filter_x,
520 int x_step_q4, const int16_t *filter_y, int y_step_q4,
521 int w, int h) {
522 aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
523 filter_y, y_step_q4, w, h);
524 }
525
aom_scaled_2d_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)526 void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
527 ptrdiff_t dst_stride, const int16_t *filter_x,
528 int x_step_q4, const int16_t *filter_y, int y_step_q4,
529 int w, int h) {
530 aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
531 filter_y, y_step_q4, w, h);
532 }
533
aom_scaled_avg_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)534 void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
535 uint8_t *dst, ptrdiff_t dst_stride,
536 const int16_t *filter_x, int x_step_q4,
537 const int16_t *filter_y, int y_step_q4, int w,
538 int h) {
539 aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
540 x_step_q4, filter_y, y_step_q4, w, h);
541 }
542
aom_scaled_avg_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)543 void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
544 uint8_t *dst, ptrdiff_t dst_stride,
545 const int16_t *filter_x, int x_step_q4,
546 const int16_t *filter_y, int y_step_q4, int w,
547 int h) {
548 aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
549 x_step_q4, filter_y, y_step_q4, w, h);
550 }
551
aom_scaled_avg_2d_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)552 void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
553 ptrdiff_t dst_stride, const int16_t *filter_x,
554 int x_step_q4, const int16_t *filter_y, int y_step_q4,
555 int w, int h) {
556 aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
557 filter_y, y_step_q4, w, h);
558 }
559
560 // TODO(afergs): Make sure this works too
561 #if CONFIG_LOOP_RESTORATION
convolve_add_src_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)562 static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
563 uint8_t *dst, ptrdiff_t dst_stride,
564 const InterpKernel *x_filters, int x0_q4,
565 int x_step_q4, int w, int h) {
566 int x, y, k;
567 src -= SUBPEL_TAPS / 2 - 1;
568 for (y = 0; y < h; ++y) {
569 int x_q4 = x0_q4;
570 for (x = 0; x < w; ++x) {
571 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
572 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
573 int sum = 0;
574 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
575 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
576 src_x[SUBPEL_TAPS / 2 - 1]);
577 x_q4 += x_step_q4;
578 }
579 src += src_stride;
580 dst += dst_stride;
581 }
582 }
583
convolve_add_src_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)584 static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
585 uint8_t *dst, ptrdiff_t dst_stride,
586 const InterpKernel *y_filters, int y0_q4,
587 int y_step_q4, int w, int h) {
588 int x, y, k;
589 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
590
591 for (x = 0; x < w; ++x) {
592 int y_q4 = y0_q4;
593 for (y = 0; y < h; ++y) {
594 const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
595 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
596 int sum = 0;
597 for (k = 0; k < SUBPEL_TAPS; ++k)
598 sum += src_y[k * src_stride] * y_filter[k];
599 dst[y * dst_stride] =
600 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
601 src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
602 y_q4 += y_step_q4;
603 }
604 ++src;
605 ++dst;
606 }
607 }
608
convolve_add_src(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * const x_filters,int x0_q4,int x_step_q4,const InterpKernel * const y_filters,int y0_q4,int y_step_q4,int w,int h)609 static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
610 uint8_t *dst, ptrdiff_t dst_stride,
611 const InterpKernel *const x_filters, int x0_q4,
612 int x_step_q4, const InterpKernel *const y_filters,
613 int y0_q4, int y_step_q4, int w, int h) {
614 uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
615 int intermediate_height =
616 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
617
618 assert(w <= MAX_SB_SIZE);
619 assert(h <= MAX_SB_SIZE);
620
621 assert(y_step_q4 <= 32);
622 assert(x_step_q4 <= 32);
623
624 convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
625 temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
626 intermediate_height);
627 convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
628 dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
629 }
630
aom_convolve8_add_src_horiz_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)631 void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
632 uint8_t *dst, ptrdiff_t dst_stride,
633 const int16_t *filter_x, int x_step_q4,
634 const int16_t *filter_y, int y_step_q4,
635 int w, int h) {
636 const InterpKernel *const filters_x = get_filter_base(filter_x);
637 const int x0_q4 = get_filter_offset(filter_x, filters_x);
638
639 (void)filter_y;
640 (void)y_step_q4;
641
642 convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
643 x_step_q4, w, h);
644 }
645
aom_convolve8_add_src_vert_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)646 void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
647 uint8_t *dst, ptrdiff_t dst_stride,
648 const int16_t *filter_x, int x_step_q4,
649 const int16_t *filter_y, int y_step_q4, int w,
650 int h) {
651 const InterpKernel *const filters_y = get_filter_base(filter_y);
652 const int y0_q4 = get_filter_offset(filter_y, filters_y);
653
654 (void)filter_x;
655 (void)x_step_q4;
656
657 convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
658 y_step_q4, w, h);
659 }
660
aom_convolve8_add_src_c(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)661 void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
662 uint8_t *dst, ptrdiff_t dst_stride,
663 const int16_t *filter_x, int x_step_q4,
664 const int16_t *filter_y, int y_step_q4, int w,
665 int h) {
666 const InterpKernel *const filters_x = get_filter_base(filter_x);
667 const int x0_q4 = get_filter_offset(filter_x, filters_x);
668
669 const InterpKernel *const filters_y = get_filter_base(filter_y);
670 const int y0_q4 = get_filter_offset(filter_y, filters_y);
671
672 convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
673 x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
674 }
675
convolve_add_src_horiz_hip(const uint8_t * src,ptrdiff_t src_stride,uint16_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)676 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
677 uint16_t *dst, ptrdiff_t dst_stride,
678 const InterpKernel *x_filters, int x0_q4,
679 int x_step_q4, int w, int h) {
680 const int bd = 8;
681 int x, y, k;
682 src -= SUBPEL_TAPS / 2 - 1;
683 for (y = 0; y < h; ++y) {
684 int x_q4 = x0_q4;
685 for (x = 0; x < w; ++x) {
686 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
687 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
688 int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
689 (1 << (bd + FILTER_BITS - 1));
690 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
691 dst[x] =
692 (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
693 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
694 x_q4 += x_step_q4;
695 }
696 src += src_stride;
697 dst += dst_stride;
698 }
699 }
700
// Vertical pass of the high-intermediate-precision "add source" convolution,
// 8-bit path. Consumes the uint16_t intermediate produced by
// convolve_add_src_horiz_hip, removes the offset and the EXTRAPREC_BITS of
// extra precision, and writes clipped 8-bit pixels column by column.
static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *y_filters, int y0_q4,
                                      int y_step_q4, int w, int h) {
  const int bd = 8;  // fixed bit depth for the low-bitdepth path
  int x, y, k;
  // Step back to the first tap row of the 8-tap window.
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      // Seed with the centre intermediate sample (the "add src" term) and
      // subtract the horizontal pass's offset, scaled up by the extra
      // precision carried in the intermediate.
      int sum =
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      // Final rounding removes FILTER_BITS plus the extra precision.
      dst[y * dst_stride] =
          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
727
// Two-pass high-intermediate-precision "add source" convolution (8-bit).
// Pass 1 filters horizontally into a uint16_t scratch buffer (with extra
// precision retained); pass 2 filters that buffer vertically into dst.
static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *const x_filters, int x0_q4,
                                 int x_step_q4,
                                 const InterpKernel *const y_filters, int y0_q4,
                                 int y_step_q4, int w, int h) {
  // Fixed-size scratch buffer: caps block size and scaling step (asserts).
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Intermediate rows: vertical span of the (possibly scaled) block plus
  // the 8-tap filter tails.
  const int im_h = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(x_step_q4 <= 32);
  assert(y_step_q4 <= 32);

  // Start SUBPEL_TAPS / 2 - 1 rows above the block so the vertical pass has
  // the tail rows it needs; the vertical pass skips back down past them.
  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
                             x_step_q4, w, im_h);
  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
                            y_step_q4, w, h);
}
751
// Public wrapper: horizontal-only hip "add source" pass. Vertical filter
// arguments are unused (shared prototype).
void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                       uint16_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const int x0 = get_filter_offset(filter_x, xk);

  (void)filter_y;
  (void)y_step_q4;

  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, xk, x0,
                             x_step_q4, w, h);
}
766
// Public wrapper: vertical-only hip "add source" pass. Horizontal filter
// arguments are unused (shared prototype).
void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h) {
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int y0 = get_filter_offset(filter_y, yk);

  (void)filter_x;
  (void)x_step_q4;

  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, yk, y0, y_step_q4,
                            w, h);
}
781
// Public wrapper: 2-D hip "add source" convolution — resolve both kernels,
// then run the two-pass high-precision implementation.
void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int x0 = get_filter_offset(filter_x, xk);
  const int y0 = get_filter_offset(filter_y, yk);

  convolve_add_src_hip(src, src_stride, dst, dst_stride, xk, x0, x_step_q4, yk,
                       y0, y_step_q4, w, h);
}
796 #endif // CONFIG_LOOP_RESTORATION
797
// TODO(afergs): Verify that the high-bitdepth path below also behaves
// correctly (it mirrors the 8-bit implementations above).
799 #if CONFIG_HIGHBITDEPTH
// High-bitdepth horizontal convolution: filters each row with an 8-tap
// kernel selected per output pixel from the sub-pixel phase (x_q4), then
// rounds and clips the result to the bd-bit pixel range.
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
  int x, y;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;  // step back to the first filter tap
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;  // running sub-pixel source position
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
822
// High-bitdepth horizontal convolution that averages the filtered result
// with the pixel already in dst: dst = round((dst + filtered) / 2).
static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
  int x, y;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;  // step back to the first filter tap
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      // Average the clipped filter output with the existing dst pixel.
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
847
// High-bitdepth vertical convolution: filters each column with an 8-tap
// kernel selected per output pixel from the sub-pixel phase (y_q4), then
// rounds and clips to the bd-bit pixel range. Traverses column-major.
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
  int x, y;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first filter tap row
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;  // running sub-pixel source position
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
872
// High-bitdepth vertical convolution that averages the filtered result with
// the pixel already in dst: dst = round((dst + filtered) / 2). Column-major.
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
  int x, y;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first filter tap row
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      // Average the clipped filter output with the existing dst pixel.
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
899
// High-bitdepth 2-D separable convolution. Horizontal filtering fills a
// fixed-size intermediate buffer, which is then filtered vertically into
// dst. The fixed buffer caps block size at MAX_SB_SIZE and the scaling
// steps at 32 (i.e. down to x1/2), enforced by the asserts below.
static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *const x_filters, int x0_q4,
                            int x_step_q4, const InterpKernel *const y_filters,
                            int y0_q4, int y_step_q4, int w, int h, int bd) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Rows the vertical pass will read: the scaled block's vertical span,
  // rounded up for a sub-pixel start, plus the 8-tap filter tails.
  const int im_h = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(x_step_q4 <= 32);
  assert(y_step_q4 <= 32);

  // Begin SUBPEL_TAPS / 2 - 1 rows above the block so the vertical pass has
  // its leading filter taps; it skips back down past them when reading.
  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE, x_filters, x0_q4,
                        x_step_q4, w, im_h, bd);
  highbd_convolve_vert(
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
933
// Public wrapper: high-bitdepth horizontal-only convolution. Vertical
// filter arguments are unused (shared prototype).
void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4, int w,
                                  int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const int x0 = get_filter_offset(filter_x, xk);

  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_horiz(src, src_stride, dst, dst_stride, xk, x0, x_step_q4, w,
                        h, bd);
}
947
// Public wrapper: high-bitdepth horizontal convolution averaged into dst.
// Vertical filter arguments are unused (shared prototype).
void aom_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const int16_t *filter_x, int x_step_q4,
                                      const int16_t *filter_y, int y_step_q4,
                                      int w, int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const int x0 = get_filter_offset(filter_x, xk);

  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, xk, x0, x_step_q4,
                            w, h, bd);
}
961
// Public wrapper: high-bitdepth vertical-only convolution. Horizontal
// filter arguments are unused (shared prototype).
void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const int16_t *filter_x, int x_step_q4,
                                 const int16_t *filter_y, int y_step_q4, int w,
                                 int h, int bd) {
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int y0 = get_filter_offset(filter_y, yk);

  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_vert(src, src_stride, dst, dst_stride, yk, y0, y_step_q4, w,
                       h, bd);
}
975
// Public wrapper: high-bitdepth vertical convolution averaged into dst.
// Horizontal filter arguments are unused (shared prototype).
void aom_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h, int bd) {
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int y0 = get_filter_offset(filter_y, yk);

  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, yk, y0, y_step_q4,
                           w, h, bd);
}
989
// Public wrapper: high-bitdepth 2-D convolution — resolve both kernels and
// their offsets, then run the separable two-pass implementation.
void aom_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4, int w,
                            int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int x0 = get_filter_offset(filter_x, xk);
  const int y0 = get_filter_offset(filter_y, yk);

  highbd_convolve(src, src_stride, dst, dst_stride, xk, x0, x_step_q4, yk, y0,
                  y_step_q4, w, h, bd);
}
1004
// High-bitdepth convolve-then-average: convolve into a scratch block, then
// average that block into dst.
void aom_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
  // Fixed-size intermediate buffer places limits on w and h.
  DECLARE_ALIGNED(16, uint16_t, tmp16[MAX_SB_SIZE * MAX_SB_SIZE]);

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);

  aom_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(tmp16),
                         MAX_SB_SIZE, filter_x, x_step_q4, filter_y, y_step_q4,
                         w, h, bd);
  // The average pass ignores its filter arguments, hence the NULL/0 fillers.
  aom_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(tmp16), MAX_SB_SIZE, dst,
                            dst_stride, NULL, 0, NULL, 0, w, h, bd);
}
1020
// High-bitdepth block copy. The filter arguments exist only to match the
// common convolve prototype and are ignored.
void aom_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int y;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  // Copy h rows of w uint16_t pixels each.
  for (y = 0; y < h; ++y) {
    memcpy(dst, src, w * sizeof(*dst));
    src += src_stride;
    dst += dst_stride;
  }
}
1041
// High-bitdepth per-pixel average into dst: dst = round((dst + src) / 2).
// The filter arguments exist only to match the common prototype.
void aom_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  int row, col;

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  for (row = 0; row < h; ++row) {
    for (col = 0; col < w; ++col) {
      dst[col] = ROUND_POWER_OF_TWO(dst[col] + src[col], 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
1064
1065 #if CONFIG_LOOP_RESTORATION
// High-bitdepth horizontal "add source" convolution: the filtered value is
// added to the centre source sample before clipping to the bd-bit range.
static void highbd_convolve_add_src_horiz(const uint8_t *src8,
                                          ptrdiff_t src_stride, uint8_t *dst8,
                                          ptrdiff_t dst_stride,
                                          const InterpKernel *x_filters,
                                          int x0_q4, int x_step_q4, int w,
                                          int h, int bd) {
  int x, y, k;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;  // step back to the first filter tap
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      // Rounded filter output plus the centre source sample ("add src").
      dst[x] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
          bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
1092
// High-bitdepth vertical "add source" convolution: the filtered value is
// added to the centre source sample before clipping. Column-major traversal.
static void highbd_convolve_add_src_vert(const uint8_t *src8,
                                         ptrdiff_t src_stride, uint8_t *dst8,
                                         ptrdiff_t dst_stride,
                                         const InterpKernel *y_filters,
                                         int y0_q4, int y_step_q4, int w, int h,
                                         int bd) {
  int x, y, k;
  // src8/dst8 carry uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first filter tap row
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      // Rounded filter output plus the centre source sample ("add src").
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
                            bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
1121
// High-bitdepth two-pass "add source" convolution: horizontal pass into a
// fixed-size intermediate buffer, then vertical pass into dst. The fixed
// buffer caps block size at MAX_SB_SIZE and the scaling steps at 32
// (down to x1/2), enforced by the asserts below.
static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    int x0_q4, int x_step_q4,
                                    const InterpKernel *const y_filters,
                                    int y0_q4, int y_step_q4, int w, int h,
                                    int bd) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Rows the vertical pass will read: the scaled block's vertical span,
  // rounded up for a sub-pixel start, plus the 8-tap filter tails.
  const int im_h = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(x_step_q4 <= 32);
  assert(y_step_q4 <= 32);

  // Begin SUBPEL_TAPS / 2 - 1 rows above the block so the vertical pass has
  // its leading filter taps; it skips back down past them when reading.
  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                src_stride, CONVERT_TO_BYTEPTR(temp),
                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
                                im_h, bd);
  highbd_convolve_add_src_vert(
      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
}
1158
// Public wrapper: high-bitdepth horizontal-only "add source" convolution.
// Vertical filter arguments are unused (shared prototype).
void aom_highbd_convolve8_add_src_horiz_c(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const int x0 = get_filter_offset(filter_x, xk);

  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, xk, x0,
                                x_step_q4, w, h, bd);
}
1171
// Public wrapper: high-bitdepth vertical-only "add source" convolution.
// Horizontal filter arguments are unused (shared prototype).
void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
                                         ptrdiff_t src_stride, uint8_t *dst,
                                         ptrdiff_t dst_stride,
                                         const int16_t *filter_x, int x_step_q4,
                                         const int16_t *filter_y, int y_step_q4,
                                         int w, int h, int bd) {
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int y0 = get_filter_offset(filter_y, yk);

  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, yk, y0,
                               y_step_q4, w, h, bd);
}
1186
// Public wrapper: high-bitdepth 2-D "add source" convolution — resolve
// both kernels, then run the two-pass implementation.
void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int x0 = get_filter_offset(filter_x, xk);
  const int y0 = get_filter_offset(filter_y, yk);

  highbd_convolve_add_src(src, src_stride, dst, dst_stride, xk, x0, x_step_q4,
                          yk, y0, y_step_q4, w, h, bd);
}
1201
// High-bitdepth horizontal pass of the high-intermediate-precision "add
// source" convolution. Writes into a plain uint16_t buffer, keeping
// EXTRAPREC_BITS of extra precision and an unsigned-range offset that the
// vertical pass removes.
static void highbd_convolve_add_src_horiz_hip(
    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h, int bd) {
  // Hoist the bd-dependent clamp limit out of the pixel loops.
  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
  int x, y, k;
  // src8 carries uint16_t pixel data behind the uint8_t interface.
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  src -= SUBPEL_TAPS / 2 - 1;  // step back to the first filter tap
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      // Seed with the centre source sample (the "add src" term) plus an
      // offset that keeps the intermediate non-negative.
      int sum = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
                (1 << (bd + FILTER_BITS - 1));
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      // Round down only to FILTER_BITS - EXTRAPREC_BITS, clamping to the
      // widened intermediate range.
      dst[x] =
          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
                          0, extraprec_clamp_limit - 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
1227
// High-bitdepth vertical pass of the high-intermediate-precision "add
// source" convolution. Consumes the uint16_t intermediate, removes the
// offset and extra precision, and writes clipped bd-bit pixels.
static void highbd_convolve_add_src_vert_hip(
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    int y_step_q4, int w, int h, int bd) {
  int x, y, k;
  // dst8 carries uint16_t pixel data behind the uint8_t interface.
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  // first filter tap row
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      // Seed with the centre intermediate sample (the "add src" term) and
      // subtract the horizontal pass's offset, scaled up by the extra
      // precision carried in the intermediate.
      int sum =
          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      // Final rounding removes FILTER_BITS plus the extra precision.
      dst[y * dst_stride] = clip_pixel_highbd(
          ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
1253
// High-bitdepth two-pass high-intermediate-precision "add source"
// convolution. Pass 1 filters horizontally into a uint16_t scratch buffer
// (keeping extra precision); pass 2 filters that buffer vertically into
// dst. The fixed buffer caps block size at MAX_SB_SIZE and the scaling
// steps at 32 (down to x1/2), enforced by the asserts below.
static void highbd_convolve_add_src_hip(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
    int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
    int y_step_q4, int w, int h, int bd) {
  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
  // Rows the vertical pass will read: the scaled block's vertical span,
  // rounded up for a sub-pixel start, plus the 8-tap filter tails.
  const int im_h = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= MAX_SB_SIZE);
  assert(h <= MAX_SB_SIZE);
  assert(x_step_q4 <= 32);
  assert(y_step_q4 <= 32);

  // Begin SUBPEL_TAPS / 2 - 1 rows above the block so the vertical pass has
  // its leading filter taps; it skips back down past them when reading.
  highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                                    src_stride, temp, MAX_SB_SIZE, x_filters,
                                    x0_q4, x_step_q4, w, im_h, bd);
  highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
                                   MAX_SB_SIZE, dst, dst_stride, y_filters,
                                   y0_q4, y_step_q4, w, h, bd);
}
1287
// Public wrapper: high-bitdepth horizontal-only hip "add source" pass.
// Vertical filter arguments are unused (shared prototype).
void aom_highbd_convolve8_add_src_horiz_hip_c(
    const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const int x0 = get_filter_offset(filter_x, xk);

  (void)filter_y;
  (void)y_step_q4;

  highbd_convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, xk, x0,
                                    x_step_q4, w, h, bd);
}
1300
// Public wrapper: high-bitdepth vertical-only hip "add source" pass.
// Horizontal filter arguments are unused (shared prototype).
void aom_highbd_convolve8_add_src_vert_hip_c(
    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int y0 = get_filter_offset(filter_y, yk);

  (void)filter_x;
  (void)x_step_q4;

  highbd_convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, yk, y0,
                                   y_step_q4, w, h, bd);
}
1313
// Public wrapper: high-bitdepth 2-D hip "add source" convolution — resolve
// both kernels, then run the two-pass high-precision implementation.
void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
                                        ptrdiff_t src_stride, uint8_t *dst,
                                        ptrdiff_t dst_stride,
                                        const int16_t *filter_x, int x_step_q4,
                                        const int16_t *filter_y, int y_step_q4,
                                        int w, int h, int bd) {
  const InterpKernel *const xk = get_filter_base(filter_x);
  const InterpKernel *const yk = get_filter_base(filter_y);
  const int x0 = get_filter_offset(filter_x, xk);
  const int y0 = get_filter_offset(filter_y, yk);

  highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, xk, x0,
                              x_step_q4, yk, y0, y_step_q4, w, h, bd);
}
1330
1331 #endif // CONFIG_LOOP_RESTORATION
1332 #endif // CONFIG_HIGHBITDEPTH
1333