1 /*
2 * Copyright(c) 2019 Intel Corporation
3 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
4 *
5 * This source code is subject to the terms of the BSD 2 Clause License and
6 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
7 * was not distributed with this source code in the LICENSE file, you can
8 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
9 * Media Patent License 1.0 was not distributed with this source code in the
10 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
11 */
12 
13 #include <stdlib.h>
14 
15 #include "EbInterPrediction.h"
16 #include "convolve.h"
17 #include "common_dsp_rtcd.h"
18 #include "EbUtility.h"
19 //#include "EbRateDistortionCost.h"
20 
// Motion-vector bound and reference-padding constants. Each value is the
// pixel expression given in its comment shifted left by 2.
#define MVBOUNDLOW \
    36 //  (80-71)<<2 ; 80 = ReferencePadding, and minus 71 is derived from the expression -64 + 1 - 8
#define MVBOUNDHIGH 348 //  (80+7)<<2 ; plus 7 is derived from the expression -1 + 8
#define REFPADD_QPEL 320 //  (16+64)<<2

#define AOM_INTERP_EXTEND 4

#define SCALE_NUMERATOR 8

// Fixed-point layout of the fine position grid used by the scaled convolve
// path in this file: SCALE_SUBPEL_BITS fractional bits, of which
// SCALE_EXTRA_BITS are in addition to the regular SUBPEL_BITS.
#define SCALE_SUBPEL_BITS 10
#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
// Half of one SUBPEL_BITS step, expressed on the SCALE_SUBPEL_BITS grid.
#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)

#define BIL_SUBPEL_BITS 3
#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)

#define ROUND0_BITS 3
#define COMPOUND_ROUND1_BITS 7
41 
// File-local wedge mask storage with one pair of entries per block size.
// NOTE(review): the meaning of the second index ([2]) is not visible in this
// part of the file - presumably the two mask polarities; confirm at the use site.
static WedgeMasksType wedge_masks[BlockSizeS_ALL][2];
43 
// Returns 1 when `type` is one of the mask-based compound modes
// (COMPOUND_WEDGE or COMPOUND_DIFFWTD), 0 for every other value.
int is_masked_compound_type(COMPOUND_TYPE type) {
    switch (type) {
    case COMPOUND_WEDGE:
    case COMPOUND_DIFFWTD: return 1;
    default: return 0;
    }
}
47 
48 
// Computes the residual diff = src - pred for a rows x cols block of 16-bit samples.
//
// src8 / pred8 are declared as byte pointers but are reinterpreted as
// uint16_t sample pointers; src_stride and pred_stride are therefore counted
// in 16-bit samples, and diff_stride in int16_t elements.
// bd (bit depth) is accepted for signature compatibility and is not used.
void svt_aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride,
                                     const uint8_t *src8, ptrdiff_t src_stride, const uint8_t *pred8,
                                     ptrdiff_t pred_stride, int bd) {
    // Keep the const qualifier through the reinterpretation: the inputs are read-only.
    const uint16_t *src  = (const uint16_t *)src8;
    const uint16_t *pred = (const uint16_t *)pred8;
    (void)bd;

    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) { diff[c] = (int16_t)(src[c] - pred[c]); }

        diff += diff_stride;
        pred += pred_stride;
        src += src_stride;
    }
}
64 
// Computes the residual diff = src - pred for a rows x cols block of 8-bit samples.
// Each buffer is addressed with its own stride (in elements of that buffer).
void svt_aom_subtract_block_c(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride,
                              const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred,
                              ptrdiff_t pred_stride) {
    for (int row = 0; row < rows; ++row) {
        const uint8_t *src_row  = src + row * src_stride;
        const uint8_t *pred_row = pred + row * pred_stride;
        int16_t *      diff_row = diff + row * diff_stride;

        for (int col = 0; col < cols; ++col) {
            diff_row[col] = (int16_t)(src_row[col] - pred_row[col]);
        }
    }
}
77 
diffwtd_mask(uint8_t * mask,int which_inverse,int mask_base,const uint8_t * src0,int src0_stride,const uint8_t * src1,int src1_stride,int h,int w)78 static void diffwtd_mask(uint8_t *mask, int which_inverse, int mask_base, const uint8_t *src0,
79                          int src0_stride, const uint8_t *src1, int src1_stride, int h, int w) {
80     for (int i = 0; i < h; ++i) {
81         for (int j = 0; j < w; ++j) {
82             int diff        = abs((int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j]);
83             int m           = clamp(mask_base + (diff / DIFF_FACTOR), 0, AOM_BLEND_A64_MAX_ALPHA);
84             mask[i * w + j] = which_inverse ? AOM_BLEND_A64_MAX_ALPHA - m : m;
85         }
86     }
87 }
diffwtd_mask_highbd(uint8_t * mask,int which_inverse,int mask_base,const uint16_t * src0,int src0_stride,const uint16_t * src1,int src1_stride,int h,int w,const unsigned int bd)88 static AOM_FORCE_INLINE void diffwtd_mask_highbd(uint8_t *mask, int which_inverse, int mask_base,
89                                                  const uint16_t *src0, int src0_stride,
90                                                  const uint16_t *src1, int src1_stride, int h,
91                                                  int w, const unsigned int bd) {
92     assert(bd >= 8);
93     if (bd == 8) {
94         if (which_inverse) {
95             for (int i = 0; i < h; ++i) {
96                 for (int j = 0; j < w; ++j) {
97                     int          diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
98                     unsigned int m    = negative_to_zero(mask_base + diff);
99                     m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
100                     mask[j]           = AOM_BLEND_A64_MAX_ALPHA - m;
101                 }
102                 src0 += src0_stride;
103                 src1 += src1_stride;
104                 mask += w;
105             }
106         } else {
107             for (int i = 0; i < h; ++i) {
108                 for (int j = 0; j < w; ++j) {
109                     int          diff = abs((int)src0[j] - (int)src1[j]) / DIFF_FACTOR;
110                     unsigned int m    = negative_to_zero(mask_base + diff);
111                     m                 = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
112                     mask[j]           = m;
113                 }
114                 src0 += src0_stride;
115                 src1 += src1_stride;
116                 mask += w;
117             }
118         }
119     } else {
120         const unsigned int bd_shift = bd - 8;
121         if (which_inverse) {
122             for (int i = 0; i < h; ++i) {
123                 for (int j = 0; j < w; ++j) {
124                     int diff       = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
125                     unsigned int m = negative_to_zero(mask_base + diff);
126                     m              = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
127                     mask[j]        = AOM_BLEND_A64_MAX_ALPHA - m;
128                 }
129                 src0 += src0_stride;
130                 src1 += src1_stride;
131                 mask += w;
132             }
133         } else {
134             for (int i = 0; i < h; ++i) {
135                 for (int j = 0; j < w; ++j) {
136                     int diff       = (abs((int)src0[j] - (int)src1[j]) >> bd_shift) / DIFF_FACTOR;
137                     unsigned int m = negative_to_zero(mask_base + diff);
138                     m              = AOMMIN(m, AOM_BLEND_A64_MAX_ALPHA);
139                     mask[j]        = m;
140                 }
141                 src0 += src0_stride;
142                 src1 += src1_stride;
143                 mask += w;
144             }
145         }
146     }
147 }
// Builds the difference-weighted compound mask for 16-bit predictions.
//
// src0 / src1 are passed as byte pointers but are reinterpreted as uint16_t
// sample pointers before use. mask_type selects the plain (DIFFWTD_38) or
// inverted (DIFFWTD_38_INV) mask, both with base value 38; any other value
// trips an assert and leaves mask untouched.
void svt_av1_build_compound_diffwtd_mask_highbd_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type,
                                                  const uint8_t *src0, int src0_stride,
                                                  const uint8_t *src1, int src1_stride, int h, int w,
                                                  int bd) {
    // Preserve const through the reinterpretation; the helper only reads the samples.
    const uint16_t *src0_16 = (const uint16_t *)src0;
    const uint16_t *src1_16 = (const uint16_t *)src1;

    switch (mask_type) {
        case DIFFWTD_38:
            diffwtd_mask_highbd(mask, 0, 38, src0_16, src0_stride, src1_16, src1_stride, h, w, bd);
            break;
        case DIFFWTD_38_INV:
            diffwtd_mask_highbd(mask, 1, 38, src0_16, src0_stride, src1_16, src1_stride, h, w, bd);
            break;
        default: assert(0);
    }
}
164 
// Builds the difference-weighted compound mask for 8-bit predictions.
// DIFFWTD_38 yields the plain mask and DIFFWTD_38_INV its complement, both
// with base value 38; any other mask_type trips an assert.
void svt_av1_build_compound_diffwtd_mask_c(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type,
                                           const uint8_t *src0, int src0_stride, const uint8_t *src1,
                                           int src1_stride, int h, int w) {
    if (mask_type == DIFFWTD_38) {
        diffwtd_mask(mask, 0, 38, src0, src0_stride, src1, src1_stride, h, w);
    } else if (mask_type == DIFFWTD_38_INV) {
        diffwtd_mask(mask, 1, 38, src0, src0_stride, src1, src1_stride, h, w);
    } else {
        assert(0);
    }
}
176 
177 
178 // Note: Expect val to be in q4 precision
scaled_x(int32_t val,const ScaleFactors * sf)179 static INLINE int32_t scaled_x(int32_t val, const ScaleFactors *sf) {
180     const int     off  = (sf->x_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
181     const int64_t tval = (int64_t)val * sf->x_scale_fp + off;
182     return (int)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
183 }
184 
185 // Note: Expect val to be in q4 precision
scaled_y(int32_t val,const ScaleFactors * sf)186 static INLINE int32_t scaled_y(int32_t val, const ScaleFactors *sf) {
187     const int32_t off  = (sf->y_scale_fp - (1 << REF_SCALE_SHIFT)) * (1 << (SUBPEL_BITS - 1));
188     const int64_t tval = (int64_t)val * sf->y_scale_fp + off;
189     return (int32_t)ROUND_POWER_OF_TWO_SIGNED_64(tval, REF_SCALE_SHIFT - SCALE_EXTRA_BITS);
190 }
191 
192 // Note: Expect val to be in q4 precision
unscaled_value(int32_t val,const ScaleFactors * sf)193 static int32_t unscaled_value(int32_t val, const ScaleFactors *sf) {
194     (void)sf;
195     return val << SCALE_EXTRA_BITS;
196 }
197 
get_fixed_point_scale_factor(int32_t other_size,int32_t this_size)198 static int32_t get_fixed_point_scale_factor(int32_t other_size, int32_t this_size) {
199     // Calculate scaling factor once for each reference frame
200     // and use fixed point scaling factors in decoding and encoding routines.
201     // Hardware implementations can calculate scale factor in device driver
202     // and use multiplication and shifting on hardware instead of division.
203     return ((other_size << REF_SCALE_SHIFT) + this_size / 2) / this_size;
204 }
205 
206 // Given the fixed point scale, calculate coarse point scale.
fixed_point_scale_to_coarse_point_scale(int32_t scale_fp)207 static int32_t fixed_point_scale_to_coarse_point_scale(int32_t scale_fp) {
208     return ROUND_POWER_OF_TWO(scale_fp, REF_SCALE_SHIFT - SCALE_SUBPEL_BITS);
209 }
210 
211 // Note: x and y are integer precision, mvq4 is q4 precision.
svt_av1_scale_mv(const MV * mvq4,int x,int y,const ScaleFactors * sf)212 MV32 svt_av1_scale_mv(const MV *mvq4, int x, int y, const ScaleFactors *sf) {
213     const int  x_off_q4 = scaled_x(x << SUBPEL_BITS, sf);
214     const int  y_off_q4 = scaled_y(y << SUBPEL_BITS, sf);
215     const MV32 res      = {scaled_y((y << SUBPEL_BITS) + mvq4->row, sf) - y_off_q4,
216                       scaled_x((x << SUBPEL_BITS) + mvq4->col, sf) - x_off_q4};
217     return res;
218 }
219 
// Fills sf for predicting a this_w x this_h frame from an other_w x other_h reference.
//
// If valid_ref_frame_size() rejects the size pair, both scale factors are set
// to REF_INVALID_SCALE and the remaining fields are left untouched. Otherwise
// the fixed-point scale factors and the derived step sizes are computed, and
// the position-conversion callbacks are chosen depending on av1_is_scaled().
void svt_av1_setup_scale_factors_for_frame(ScaleFactors *sf, int other_w, int other_h, int this_w,
                                           int this_h) {
    if (valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
        sf->x_scale_fp = get_fixed_point_scale_factor(other_w, this_w);
        sf->y_scale_fp = get_fixed_point_scale_factor(other_h, this_h);

        sf->x_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->x_scale_fp);
        sf->y_step_q4 = fixed_point_scale_to_coarse_point_scale(sf->y_scale_fp);

        // Unscaled references use the cheaper conversion that ignores sf.
        const int needs_scaling = av1_is_scaled(sf) ? 1 : 0;
        sf->scale_value_x       = needs_scaling ? scaled_x : unscaled_value;
        sf->scale_value_y       = needs_scaling ? scaled_y : unscaled_value;
    } else {
        sf->x_scale_fp = REF_INVALID_SCALE;
        sf->y_scale_fp = REF_INVALID_SCALE;
    }
}
242 
has_scale(int32_t xs,int32_t ys)243 static INLINE int32_t has_scale(int32_t xs, int32_t ys) {
244     return xs != SCALE_SUBPEL_SHIFTS || ys != SCALE_SUBPEL_SHIFTS;
245 }
246 
revert_scale_extra_bits(SubpelParams * sp)247 static INLINE void revert_scale_extra_bits(SubpelParams *sp) {
248     sp->subpel_x >>= SCALE_EXTRA_BITS;
249     sp->subpel_y >>= SCALE_EXTRA_BITS;
250     sp->xs >>= SCALE_EXTRA_BITS;
251     sp->ys >>= SCALE_EXTRA_BITS;
252     assert(sp->subpel_x < SUBPEL_SHIFTS);
253     assert(sp->subpel_y < SUBPEL_SHIFTS);
254     assert(sp->xs <= SUBPEL_SHIFTS);
255     assert(sp->ys <= SUBPEL_SHIFTS);
256 }
257 
// Interpolation kernels, one row per sub-pel position (SUBPEL_SHIFTS rows).
// Every row sums to 128; row 0 is the pass-through kernel. The first and last
// coefficient of each row are zero, so at most six taps are active.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 2, -6, 126, 8, -2, 0, 0},
    {0, 2, -10, 122, 18, -4, 0, 0},
    {0, 2, -12, 116, 28, -8, 2, 0},
    {0, 2, -14, 110, 38, -10, 2, 0},
    {0, 2, -14, 102, 48, -12, 2, 0},
    {0, 2, -16, 94, 58, -12, 2, 0},
    {0, 2, -14, 84, 66, -12, 2, 0},
    {0, 2, -14, 76, 76, -14, 2, 0},
    {0, 2, -12, 66, 84, -14, 2, 0},
    {0, 2, -12, 58, 94, -16, 2, 0},
    {0, 2, -12, 48, 102, -14, 2, 0},
    {0, 2, -10, 38, 110, -14, 2, 0},
    {0, 2, -8, 28, 116, -12, 2, 0},
    {0, 0, -4, 18, 122, -10, 2, 0},
    {0, 0, -2, 8, 126, -6, 2, 0}};
// Shorter interpolation kernels stored in the same 8-coefficient layout, one
// row per sub-pel position (SUBPEL_SHIFTS rows). Only the four middle
// coefficients are non-zero; every row sums to 128 and row 0 is the
// pass-through kernel.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, -4, 126, 8, -2, 0, 0},
    {0, 0, -8, 122, 18, -4, 0, 0},
    {0, 0, -10, 116, 28, -6, 0, 0},
    {0, 0, -12, 110, 38, -8, 0, 0},
    {0, 0, -12, 102, 48, -10, 0, 0},
    {0, 0, -14, 94, 58, -10, 0, 0},
    {0, 0, -12, 84, 66, -10, 0, 0},
    {0, 0, -12, 76, 76, -12, 0, 0},
    {0, 0, -10, 66, 84, -12, 0, 0},
    {0, 0, -10, 58, 94, -14, 0, 0},
    {0, 0, -10, 48, 102, -12, 0, 0},
    {0, 0, -8, 38, 110, -12, 0, 0},
    {0, 0, -6, 28, 116, -10, 0, 0},
    {0, 0, -4, 18, 122, -8, 0, 0},
    {0, 0, -2, 8, 126, -4, 0, 0}};
292 
// Largest filter length assumed when sizing the on-stack intermediate buffers
// of the 2-D convolve routines in this file.
#define MAX_FILTER_TAP 8
// Returns the signed distance ref_hint - order_hint, wrapped into the range
// representable with order_hint_bits bits (two's-complement style sign
// extension at bit order_hint_bits - 1). Returns 0 when order hints are disabled.
int get_relative_dist_enc(SeqHeader *seq_header, int ref_hint, int order_hint) {
    if (!seq_header->order_hint_info.enable_order_hint) return 0;

    const int sign_bit = 1 << (seq_header->order_hint_info.order_hint_bits - 1);
    const int delta    = ref_hint - order_hint;

    // Low bits carry the magnitude; the sign bit, when set, contributes -sign_bit.
    return (delta & (sign_bit - 1)) - (delta & sign_bit);
}
302 
// Threshold multipliers applied to the two frame distances when
// svt_av1_dist_wtd_comp_weight_assign() searches for a weight row. That
// function only scans rows 0..2.
static const int quant_dist_weight[4][2] = {{2, 3}, {2, 5}, {2, 7}, {1, MAX_FRAME_DISTANCE}};
// Blend weight pairs, indexed as [order_idx][row][order]. Every pair sums to
// 16, and the second plane holds the pairs of the first plane swapped.
static const int quant_dist_lookup_table[2][4][2] = {
    {{9, 7}, {11, 5}, {12, 4}, {13, 3}},
    {{7, 9}, {5, 11}, {4, 12}, {3, 13}},
};
308 
// Chooses the forward/backward blend weights for distance-weighted compound
// prediction.
//
// *use_dist_wtd_comp_avg is set to 1 only for compound blocks with
// compound_idx == 0; otherwise it is set to 0 and the offsets are left
// untouched. The weights come from quant_dist_lookup_table[order_idx], with
// the row selected by comparing the two frame distances (clamped to
// MAX_FRAME_DISTANCE) against the quant_dist_weight thresholds. The last row
// is used when either distance is zero or no threshold matches.
void svt_av1_dist_wtd_comp_weight_assign(SeqHeader *seq_header, int cur_frame_index,
                                         int bck_frame_index, int fwd_frame_index, int compound_idx,
                                         int order_idx, int *fwd_offset, int *bck_offset,
                                         int *use_dist_wtd_comp_avg, int is_compound) {
    assert(fwd_offset != NULL && bck_offset != NULL);

    const int enabled      = is_compound && !compound_idx;
    *use_dist_wtd_comp_avg = enabled;
    if (!enabled) return;

    const int d0 = clamp(abs(get_relative_dist_enc(seq_header, fwd_frame_index, cur_frame_index)),
                         0,
                         MAX_FRAME_DISTANCE);
    const int d1 = clamp(abs(get_relative_dist_enc(seq_header, cur_frame_index, bck_frame_index)),
                         0,
                         MAX_FRAME_DISTANCE);

    const int order = d0 <= d1;

    int row = 3; // fallback row
    if (d0 != 0 && d1 != 0) {
        for (row = 0; row < 3; ++row) {
            const int scaled_d0 = d0 * quant_dist_weight[row][order];
            const int scaled_d1 = d1 * quant_dist_weight[row][1 - order];
            // Stop at the first row whose weighted comparison contradicts the raw ordering.
            if (order ? (scaled_d0 > scaled_d1) : (scaled_d0 < scaled_d1)) break;
        }
    }

    const int *weights = quant_dist_lookup_table[order_idx][row];
    *fwd_offset        = weights[order];
    *bck_offset        = weights[1 - order];
}
348 
// Single-reference 2-D separable convolution for 8-bit pixels.
//
// Runs a horizontal pass from src into a 16-bit intermediate block, then a
// vertical pass from that block into dst. Both kernels are fixed for the whole
// block and chosen by the low SUBPEL_MASK bits of subpel_x_q4 / subpel_y_q4.
// Rounding shifts come from conv_params->round_0 (after the horizontal pass)
// and conv_params->round_1 (after the vertical pass).
void svt_av1_convolve_2d_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    // The vertical filter needs taps - 1 extra rows around the h output rows.
    int32_t       im_h      = h + filter_params_y->taps - 1;
    int32_t       im_stride = w;
    const int32_t fo_vert   = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz  = filter_params_x->taps / 2 - 1;
    const int32_t bd        = 8;
    // Shift still to be applied after round_0 and round_1 have been used.
    const int32_t bits      = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;

    // horizontal filter
    // Start fo_vert rows above the block so the vertical pass has its leading rows.
    const uint8_t *src_horiz = src - fo_vert * src_stride;
    const int16_t *x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t y = 0; y < im_h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            // The initial offset keeps the accumulated sum non-negative (asserted below).
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int32_t k = 0; k < filter_params_x->taps; ++k)
                sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
    }

    // vertical filter
    int16_t *      src_vert = im_block + fo_vert * im_stride;
    const int16_t *y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    for (int32_t y = 0; y < h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            // Second offset, again keeping the sum non-negative (asserted below).
            int32_t sum = 1 << offset_bits;
            for (int32_t k = 0; k < filter_params_y->taps; ++k)
                sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            // Round by round_1, then subtract the two offset terms. The second term is
            // presumably the horizontal-pass offset after it has gone through the
            // vertical filter and both roundings - TODO confirm against the kernel gain.
            int16_t res             = (ConvBufType)(ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
                                        ((1 << (offset_bits - conv_params->round_1)) +
                                         (1 << (offset_bits - conv_params->round_1 - 1))));
            // Final rounding, then clip to the 8-bit pixel range.
            dst[y * dst_stride + x] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), 8);
        }
    }
}
394 
// Single-reference vertical-only convolution for 8-bit pixels.
//
// Applies the vertical kernel selected by the low SUBPEL_MASK bits of
// subpel_y_q4 to every column, rounds the sum by FILTER_BITS and clips it to
// 8 bits. The horizontal parameters are ignored; conv_params is only
// consulted by the assertions.
void svt_av1_convolve_y_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                             int32_t dst_stride, int32_t w, int32_t h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params;

    assert(filter_params_y != NULL);
    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    // Rows above the output row that the kernel reaches.
    const int32_t rows_above = filter_params_y->taps / 2 - 1;

    // vertical filter
    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        uint8_t *out_row = dst + row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            // First sample touched by the kernel for this output pixel.
            const uint8_t *column = src + (row - rows_above) * src_stride + col;
            int32_t        acc    = 0;
            for (int32_t k = 0; k < filter_params_y->taps; ++k)
                acc += kernel[k] * column[k * src_stride];
            out_row[col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), 8);
        }
    }
}
424 
// Single-reference horizontal-only convolution for 8-bit pixels.
//
// Applies the horizontal kernel selected by the low SUBPEL_MASK bits of
// subpel_x_q4 to every row. The sum is rounded in two steps - first by
// conv_params->round_0, then by the remaining FILTER_BITS - round_0 bits - and
// clipped to 8 bits. The vertical parameters are ignored.
void svt_av1_convolve_x_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                             int32_t dst_stride, int32_t w, int32_t h,
                             InterpFilterParams *filter_params_x,
                             InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                             const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_y;
    (void)subpel_y_q4;
    (void)conv_params;

    // Samples to the left of the output position that the kernel reaches.
    const int32_t cols_left      = filter_params_x->taps / 2 - 1;
    const int32_t remaining_bits = FILTER_BITS - conv_params->round_0;

    assert(remaining_bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    // horizontal filter
    const int16_t *kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        const uint8_t *in_row  = src + row * src_stride;
        uint8_t *      out_row = dst + row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t k = 0; k < filter_params_x->taps; ++k)
                acc += kernel[k] * in_row[col - cols_left + k];
            acc          = ROUND_POWER_OF_TWO(acc, conv_params->round_0);
            out_row[col] = (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, remaining_bits), 8);
        }
    }
}
454 
// Single-reference "convolution" with no filtering: copies the w x h block
// from src to dst pixel by pixel. All filter, sub-pel and conv_params
// arguments are ignored; they exist only to match the common convolve signature.
void svt_av1_convolve_2d_copy_sr_c(const uint8_t *src, int32_t src_stride, uint8_t *dst,
                                   int32_t dst_stride, int32_t w, int32_t h,
                                   InterpFilterParams *filter_params_x,
                                   InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                   const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;
    (void)conv_params;

    for (int32_t row = 0; row < h; ++row) {
        const uint8_t *in_row  = src + row * src_stride;
        uint8_t *      out_row = dst + row * dst_stride;
        for (int32_t col = 0; col < w; ++col) { out_row[col] = in_row[col]; }
    }
}
470 
// 2-D separable convolution with per-pixel source stepping (scaled reference),
// for 8-bit pixels.
//
// The source position of each output pixel advances by x_step_qn / y_step_qn,
// starting at subpel_x_qn / subpel_y_qn, on a grid with SCALE_SUBPEL_BITS
// fractional bits. The integer part selects the source sample and the
// fractional part (with SCALE_EXTRA_BITS dropped) selects the kernel.
//
// Output depends on conv_params:
//  - not compound: the rounded result is written to dst8;
//  - compound, do_average == 0: the intermediate result is stored in conv_params->dst;
//  - compound, do_average != 0: the result is averaged with the value already
//    in conv_params->dst (weighted by fwd_offset / bck_offset when
//    use_dist_wtd_comp_avg is set) and the final pixel is written to dst8.
void svt_av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride,
                                 int w, int h, const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y, const int subpel_x_qn,
                                 const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
                                 ConvolveParams *conv_params) {
    int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    // Intermediate rows needed: integer source row of the last output row, plus the filter taps.
    int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + filter_params_y->taps;
    CONV_BUF_TYPE *dst16        = conv_params->dst;
    const int      dst16_stride = conv_params->dst_stride;
    // Shift still to be applied after round_0 and round_1 have been used.
    const int      bits         = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    assert(bits >= 0);
    int       im_stride = w;
    const int fo_vert   = filter_params_y->taps / 2 - 1;
    const int fo_horiz  = filter_params_x->taps / 2 - 1;
    const int bd        = 8;

    // horizontal filter
    // Start fo_vert rows above the block so the vertical pass has its leading rows.
    const uint8_t *src_horiz = src - fo_vert * src_stride;
    for (int y = 0; y < im_h; ++y) {
        int x_qn = subpel_x_qn;
        for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
            // Integer part of the position picks the sample, fractional part the kernel.
            const uint8_t *const src_x        = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
            const int            x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(x_filter_idx < SUBPEL_SHIFTS);
            const int16_t *x_filter =
                av1_get_interp_filter_subpel_kernel(*filter_params_x, x_filter_idx);
            // The initial offset keeps the accumulated sum non-negative (asserted below).
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int k = 0; k < filter_params_x->taps; ++k) {
                sum += x_filter[k] * src_x[k - fo_horiz];
            }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
        src_horiz += src_stride;
    }

    // vertical filter
    // Iterates column by column: x is the outer loop and src_vert advances one column per pass.
    int16_t * src_vert    = im_block + fo_vert * im_stride;
    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    for (int x = 0; x < w; ++x) {
        int y_qn = subpel_y_qn;
        for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
            const int16_t *src_y        = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
            const int      y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(y_filter_idx < SUBPEL_SHIFTS);
            const int16_t *y_filter =
                av1_get_interp_filter_subpel_kernel(*filter_params_y, y_filter_idx);
            // Second offset, again keeping the sum non-negative (asserted below).
            int32_t sum = 1 << offset_bits;
            for (int k = 0; k < filter_params_y->taps; ++k) {
                sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
            }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            // res still contains the offsets; they are removed only when a final pixel is produced.
            CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
            if (conv_params->is_compound) {
                if (conv_params->do_average) {
                    // Second prediction: combine with the first one stored in dst16.
                    int32_t tmp = dst16[y * dst16_stride + x];
                    if (conv_params->use_dist_wtd_comp_avg) {
                        tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
                        tmp = tmp >> DIST_PRECISION_BITS;
                    } else {
                        tmp += res;
                        tmp = tmp >> 1;
                    }
                    /* Subtract round offset and convolve round */
                    tmp                       = tmp - ((1 << (offset_bits - conv_params->round_1)) +
                                 (1 << (offset_bits - conv_params->round_1 - 1)));
                    dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
                } else {
                    // First prediction: keep the offset intermediate value for the later average.
                    dst16[y * dst16_stride + x] = res;
                }
            } else {
                /* Subtract round offset and convolve round */
                int32_t tmp               = res - ((1 << (offset_bits - conv_params->round_1)) +
                                     (1 << (offset_bits - conv_params->round_1 - 1)));
                dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
            }
        }
        src_vert++;
    }
}
551 
// Compound 2-D separable convolution for 8-bit pixels.
//
// Runs a horizontal pass into a 16-bit intermediate block and then a vertical
// pass. With conv_params->do_average == 0 the offset intermediate result is
// stored in conv_params->dst. With do_average != 0 it is averaged with the
// value already stored there (weighted by fwd_offset / bck_offset when
// use_jnt_comp_avg is set), the offsets are removed, and the rounded, clipped
// pixel is written to dst8.
void svt_av1_jnt_convolve_2d_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                               int32_t dst8_stride, int32_t w, int32_t h,
                               InterpFilterParams *filter_params_x,
                               InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                               const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    ConvBufType * dst        = conv_params->dst;
    int32_t       dst_stride = conv_params->dst_stride;
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    // The vertical filter needs taps - 1 extra rows around the h output rows.
    int32_t       im_h       = h + filter_params_y->taps - 1;
    int32_t       im_stride  = w;
    const int32_t fo_vert    = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz   = filter_params_x->taps / 2 - 1;
    const int32_t bd         = 8;
    // Shift still to be applied after round_0 and round_1 have been used.
    const int32_t round_bits = 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;

    // horizontal filter
    // Start fo_vert rows above the block so the vertical pass has its leading rows.
    const uint8_t *src_horiz = src - fo_vert * src_stride;
    const int16_t *x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t y = 0; y < im_h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            // The initial offset keeps the accumulated sum non-negative (asserted below).
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int32_t k = 0; k < filter_params_x->taps; ++k)
                sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
    }

    // vertical filter
    int16_t *      src_vert = im_block + fo_vert * im_stride;
    const int16_t *y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    for (int32_t y = 0; y < h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            // Second offset, again keeping the sum non-negative (asserted below).
            int32_t sum = 1 << offset_bits;
            for (int32_t k = 0; k < filter_params_y->taps; ++k)
                sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            // res still contains the offsets; they are removed only in the averaging branch.
            ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, conv_params->round_1);
            if (conv_params->do_average) {
                // Second prediction: combine with the first one stored in dst.
                int32_t tmp = dst[y * dst_stride + x];
                if (conv_params->use_jnt_comp_avg) {
                    tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
                    tmp = tmp >> DIST_PRECISION_BITS;
                } else {
                    tmp += res;
                    tmp = tmp >> 1;
                }
                // Remove both offset terms before the final rounding and clipping.
                tmp -= (1 << (offset_bits - conv_params->round_1)) +
                       (1 << (offset_bits - conv_params->round_1 - 1));
                dst8[y * dst8_stride + x] =
                    (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), 8);
            } else
                // First prediction: keep the offset intermediate value for the later average.
                dst[y * dst_stride + x] = res;
        }
    }
}
611 
/*
 * Vertical-only interpolation of an 8-bit block with two possible outputs.
 *
 * Each output sample is the tap-weighted sum of src samples taken from the
 * same column (rows y - (taps/2 - 1) .. y + taps - taps/2), scaled up by
 * (FILTER_BITS - round_0) bits, rounded down by round_1 bits and biased by
 * round_offset. Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped and written to dst8.
 *
 * filter_params_x and subpel_x_q4 are ignored by this kernel.
 */
void svt_av1_jnt_convolve_y_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                              int32_t dst8_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_x;
    (void)subpel_x_q4;

    const int32_t bd           = 8;
    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t num_taps     = filter_params_y->taps;
    const int32_t rows_above   = num_taps / 2 - 1;
    const int32_t up_shift     = FILTER_BITS - rnd0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    const int32_t round_bits   = 2 * FILTER_BITS - rnd0 - rnd1;

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // Accumulate the vertical taps around (row, col).
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[(row - rows_above + tap) * src_stride + col];
            }
            acc *= (1 << up_shift);
            acc = ROUND_POWER_OF_TWO(acc, rnd1) + round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                // No averaging requested: keep the biased intermediate.
                buf[buf_idx] = (ConvBufType)acc;
                continue;
            }

            // Averaging requested: blend with the stored intermediate.
            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + acc * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + acc) >> 1;
            }
            blend -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), 8);
        }
    }
}
657 
/*
 * Horizontal-only interpolation of an 8-bit block with two possible outputs.
 *
 * Each output sample is the tap-weighted sum of src samples taken from the
 * same row (columns x - (taps/2 - 1) .. x + taps - taps/2), rounded down by
 * round_0 bits, scaled up by (FILTER_BITS - round_1) bits and biased by
 * round_offset. Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped and written to dst8.
 *
 * filter_params_y and subpel_y_q4 are ignored by this kernel.
 */
void svt_av1_jnt_convolve_x_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                              int32_t dst8_stride, int32_t w, int32_t h,
                              InterpFilterParams *filter_params_x,
                              InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                              const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int32_t bd           = 8;
    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t num_taps     = filter_params_x->taps;
    const int32_t cols_left    = num_taps / 2 - 1;
    const int32_t up_shift     = FILTER_BITS - rnd1;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    const int32_t round_bits   = 2 * FILTER_BITS - rnd0 - rnd1;

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // Accumulate the horizontal taps around (row, col).
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[row * src_stride + col - cols_left + tap];
            }
            acc = (1 << up_shift) * ROUND_POWER_OF_TWO(acc, rnd0);
            acc += round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                // No averaging requested: keep the biased intermediate.
                buf[buf_idx] = (ConvBufType)acc;
                continue;
            }

            // Averaging requested: blend with the stored intermediate.
            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + acc * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + acc) >> 1;
            }
            blend -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), 8);
        }
    }
}
703 
/*
 * Unfiltered (copy) variant of the 8-bit jnt convolve kernels.
 *
 * Each source sample is shifted left by
 * (2 * FILTER_BITS - round_0 - round_1) bits and biased by round_offset, both
 * steps being carried out in ConvBufType. Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped and written to dst8.
 *
 * Both filter parameter sets and both sub-pel positions are ignored.
 */
void svt_av1_jnt_convolve_2d_copy_c(const uint8_t *src, int32_t src_stride, uint8_t *dst8,
                                    int32_t dst8_stride, int32_t w, int32_t h,
                                    InterpFilterParams *filter_params_x,
                                    InterpFilterParams *filter_params_y, const int32_t subpel_x_q4,
                                    const int32_t subpel_y_q4, ConvolveParams *conv_params) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;

    const int32_t bd           = 8;
    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t bits         = FILTER_BITS * 2 - rnd1 - rnd0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // Both steps intentionally narrow to ConvBufType.
            ConvBufType val = src[row * src_stride + col] << bits;
            val += (ConvBufType)round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                buf[buf_idx] = val;
                continue;
            }

            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + val * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + val) >> 1;
            }
            blend -= round_offset;
            dst8[row * dst8_stride + col] =
                (uint8_t)clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, bits), 8);
        }
    }
}
743 
/*
 * Copies a w x h block of 16-bit samples from src to dst, one sample at a
 * time, honouring the two strides. No filtering, rounding or clipping is
 * applied; the filter parameters, sub-pel positions, conv_params and bd are
 * all ignored.
 */
void svt_av1_highbd_convolve_2d_copy_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                          int32_t dst_stride, int32_t w, int32_t h,
                                          const InterpFilterParams *filter_params_x,
                                          const InterpFilterParams *filter_params_y,
                                          const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                          ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;
    (void)conv_params;
    (void)bd;

    for (int32_t row = 0; row < h; ++row) {
        const int32_t src_off = row * src_stride;
        const int32_t dst_off = row * dst_stride;
        for (int32_t col = 0; col < w; ++col) {
            dst[dst_off + col] = src[src_off + col];
        }
    }
}
761 
/*
 * Horizontal-only interpolation of a block of 16-bit samples, writing final
 * pixels.
 *
 * Each output sample is the tap-weighted sum of src samples from the same
 * row, rounded first by round_0 bits and then by the remaining
 * (FILTER_BITS - round_0) bits, and finally clipped with
 * clip_pixel_highbd(..., bd) before being stored in dst.
 *
 * filter_params_y and subpel_y_q4 are ignored by this kernel.
 */
void svt_av1_highbd_convolve_x_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                    int32_t dst_stride, int32_t w, int32_t h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                    ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int32_t num_taps   = filter_params_x->taps;
    const int32_t cols_left  = num_taps / 2 - 1;
    const int32_t rnd0       = conv_params->round_0;
    const int32_t bits       = FILTER_BITS - rnd0;

    assert(bits >= 0);
    assert((FILTER_BITS - conv_params->round_1) >= 0 ||
           ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));

    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[row * src_stride + col - cols_left + tap];
            }
            // Two-stage rounding: round_0 first, then the remaining bits.
            acc = ROUND_POWER_OF_TWO(acc, rnd0);
            dst[row * dst_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, bits), bd);
        }
    }
}
790 
/*
 * Vertical-only interpolation of a block of 16-bit samples, writing final
 * pixels.
 *
 * Each output sample is the tap-weighted sum of src samples from the same
 * column, rounded by FILTER_BITS and clipped with clip_pixel_highbd(..., bd)
 * before being stored in dst.
 *
 * filter_params_x and subpel_x_q4 are ignored; conv_params is only inspected
 * by the assertions.
 */
void svt_av1_highbd_convolve_y_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                    int32_t dst_stride, int32_t w, int32_t h,
                                    const InterpFilterParams *filter_params_x,
                                    const InterpFilterParams *filter_params_y,
                                    const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                    ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)subpel_x_q4;
    (void)conv_params; // referenced only by the asserts below

    assert(filter_params_y != NULL);
    assert(conv_params->round_0 <= FILTER_BITS);
    assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
           ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));

    const int32_t num_taps   = filter_params_y->taps;
    const int32_t rows_above = num_taps / 2 - 1;
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[(row - rows_above + tap) * src_stride + col];
            }
            dst[row * dst_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(acc, FILTER_BITS), bd);
        }
    }
}
818 
/*
 * Separable 2D interpolation of a block of 16-bit samples, writing final
 * pixels.
 *
 * Pass 1 filters horizontally over h + taps_y - 1 rows (starting
 * taps_y / 2 - 1 rows above the block) and stores the results, biased by
 * 1 << (bd + FILTER_BITS - 1) and rounded by round_0, in the int16_t scratch
 * buffer im_block (row pitch w).
 * Pass 2 filters im_block vertically with a bias of 1 << offset_bits, rounds
 * by round_1, removes the accumulated bias, rounds by the remaining
 * (2 * FILTER_BITS - round_0 - round_1) bits and clips the result with
 * clip_pixel_highbd(..., bd) into dst.
 */
void svt_av1_highbd_convolve_2d_sr_c(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                     int32_t dst_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    int32_t       im_h      = h + filter_params_y->taps - 1;
    int32_t       im_stride = w;
    const int32_t fo_vert   = filter_params_y->taps / 2 - 1;
    const int32_t fo_horiz  = filter_params_x->taps / 2 - 1;
    const int32_t bits      = FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    assert(bits >= 0);

    // horizontal filter
    const uint16_t *src_horiz = src - fo_vert * src_stride;
    const int16_t * x_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int32_t y = 0; y < im_h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int32_t k = 0; k < filter_params_x->taps; ++k)
                sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            // im_block holds int16_t, so narrow to that type directly (as the
            // other 2D kernels in this file do) instead of going through the
            // unrelated ConvBufType.
            im_block[y * im_stride + x] = (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
        }
    }

    // vertical filter
    int16_t *      src_vert = im_block + fo_vert * im_stride;
    const int16_t *y_filter =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    const int32_t offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    // Bias carried by the rounded vertical sum: the 1 << offset_bits added
    // below plus the horizontal bias propagated through the vertical taps.
    const int32_t round_offset = (1 << (offset_bits - conv_params->round_1)) +
                                 (1 << (offset_bits - conv_params->round_1 - 1));
    for (int32_t y = 0; y < h; ++y) {
        for (int32_t x = 0; x < w; ++x) {
            int32_t sum = 1 << offset_bits;
            for (int32_t k = 0; k < filter_params_y->taps; ++k)
                sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) - round_offset;
            dst[y * dst_stride + x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
        }
    }
}
866 
/*
 * Separable 2D interpolation of 16-bit samples with a per-sample position
 * step in both directions.
 *
 * Positions are tracked in units of 1 / (1 << SCALE_SUBPEL_BITS): the
 * horizontal position starts at subpel_x_qn and advances by x_step_qn per
 * output column, the vertical one starts at subpel_y_qn and advances by
 * y_step_qn per output row. The integer part selects the source sample, the
 * fractional part (reduced by SCALE_EXTRA_BITS) selects the filter kernel.
 *
 * Pass 1 filters horizontally into the int16_t scratch buffer im_block
 * (row pitch w). Pass 2 filters im_block vertically, column by column, and
 * then:
 *   - is_compound && !do_average: stores the rounded sum in conv_params->dst.
 *   - is_compound && do_average: blends it with the value already in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_dist_wtd_comp_avg is set, plain mean otherwise), removes the bias,
 *     rounds, clips and writes dst.
 *   - !is_compound: removes the bias, rounds, clips and writes dst.
 */
void svt_av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride, uint16_t *dst,
                                        int dst_stride, int w, int h,
                                        const InterpFilterParams *filter_params_x,
                                        const InterpFilterParams *filter_params_y,
                                        const int subpel_x_qn, const int x_step_qn,
                                        const int subpel_y_qn, const int y_step_qn,
                                        ConvolveParams *conv_params, int bd) {
    int16_t   im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    const int taps_x   = filter_params_x->taps;
    const int taps_y   = filter_params_y->taps;
    const int im_rows  = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) + taps_y;
    const int im_pitch = w;
    const int above    = taps_y / 2 - 1;
    const int left     = taps_x / 2 - 1;
    const int rnd0     = conv_params->round_0;
    const int rnd1     = conv_params->round_1;
    const int bits     = FILTER_BITS * 2 - rnd0 - rnd1;
    assert(bits >= 0);

    CONV_BUF_TYPE *const dst16        = conv_params->dst;
    const int            dst16_stride = conv_params->dst_stride;

    // Pass 1: horizontal filter into im_block.
    const uint16_t *const src_top = src - above * src_stride;
    for (int r = 0; r < im_rows; ++r) {
        const uint16_t *const in_row = src_top + r * src_stride;
        int                   pos_qn = subpel_x_qn;
        for (int c = 0; c < w; ++c) {
            const uint16_t *const window = &in_row[(pos_qn >> SCALE_SUBPEL_BITS)];
            const int             phase  = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(phase < SUBPEL_SHIFTS);
            const int16_t *const kernel =
                av1_get_interp_filter_subpel_kernel(*filter_params_x, phase);

            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int k = 0; k < taps_x; ++k) { sum += kernel[k] * window[k - left]; }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[r * im_pitch + c] = (int16_t)ROUND_POWER_OF_TWO(sum, rnd0);

            pos_qn += x_step_qn;
        }
    }

    // Pass 2: vertical filter, walking one output column at a time.
    const int offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    for (int c = 0; c < w; ++c) {
        const int16_t *const col_base = im_block + above * im_pitch + c;
        int                  pos_qn   = subpel_y_qn;
        for (int r = 0; r < h; ++r, pos_qn += y_step_qn) {
            const int16_t *const window = &col_base[(pos_qn >> SCALE_SUBPEL_BITS) * im_pitch];
            const int            phase  = (pos_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
            assert(phase < SUBPEL_SHIFTS);
            const int16_t *const kernel =
                av1_get_interp_filter_subpel_kernel(*filter_params_y, phase);

            int32_t sum = 1 << offset_bits;
            for (int k = 0; k < taps_y; ++k) { sum += kernel[k] * window[(k - above) * im_pitch]; }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, rnd1);

            if (conv_params->is_compound && !conv_params->do_average) {
                // Compound without averaging: keep the biased intermediate.
                dst16[r * dst16_stride + c] = res;
                continue;
            }

            int32_t px;
            if (conv_params->is_compound) {
                // Compound with averaging: blend with the stored intermediate.
                px = dst16[r * dst16_stride + c];
                if (conv_params->use_dist_wtd_comp_avg) {
                    px = (px * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                         DIST_PRECISION_BITS;
                } else {
                    px = (px + res) >> 1;
                }
            } else {
                px = res;
            }

            /* Subtract round offset and convolve round */
            px -= round_offset;
            dst[r * dst_stride + c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(px, bits), bd);
        }
    }
}
947 
/*
 * Horizontal-only interpolation of 16-bit samples with two possible outputs.
 *
 * Each output sample is the tap-weighted sum of src samples from the same
 * row, rounded down by round_0 bits, scaled up by (FILTER_BITS - round_1)
 * bits and biased by round_offset (which depends on bd). Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped to bd bits and written to dst16.
 *
 * filter_params_y and subpel_y_q4 are ignored by this kernel.
 */
void svt_av1_highbd_jnt_convolve_x_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                     int32_t dst16_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_y;
    (void)subpel_y_q4;

    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t num_taps     = filter_params_x->taps;
    const int32_t cols_left    = num_taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - rnd1;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    const int32_t round_bits   = 2 * FILTER_BITS - rnd0 - rnd1;
    assert(round_bits >= 0);
    assert(bits >= 0);

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[row * src_stride + col - cols_left + tap];
            }
            acc = (1 << bits) * ROUND_POWER_OF_TWO(acc, rnd0);
            acc += round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                // No averaging requested: keep the biased intermediate.
                buf[buf_idx] = (ConvBufType)acc;
                continue;
            }

            // Averaging requested: blend with the stored intermediate.
            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + acc * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + acc) >> 1;
            }
            blend -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
        }
    }
}
994 
/*
 * Vertical-only interpolation of 16-bit samples with two possible outputs.
 *
 * Each output sample is the tap-weighted sum of src samples from the same
 * column, scaled up by (FILTER_BITS - round_0) bits, rounded down by round_1
 * bits and biased by round_offset (which depends on bd). Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped to bd bits and written to dst16.
 *
 * filter_params_x and subpel_x_q4 are ignored by this kernel.
 */
void svt_av1_highbd_jnt_convolve_y_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                     int32_t dst16_stride, int32_t w, int32_t h,
                                     const InterpFilterParams *filter_params_x,
                                     const InterpFilterParams *filter_params_y,
                                     const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                     ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)subpel_x_q4;

    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t num_taps     = filter_params_y->taps;
    const int32_t rows_above   = num_taps / 2 - 1;
    const int32_t bits         = FILTER_BITS - rnd0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    const int32_t round_bits   = 2 * FILTER_BITS - rnd0 - rnd1;
    assert(round_bits >= 0);
    assert(bits >= 0);

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;
    const int16_t *const kernel =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            int32_t acc = 0;
            for (int32_t tap = 0; tap < num_taps; ++tap) {
                acc += kernel[tap] * src[(row - rows_above + tap) * src_stride + col];
            }
            acc *= (1 << bits);
            acc = ROUND_POWER_OF_TWO(acc, rnd1) + round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                // No averaging requested: keep the biased intermediate.
                buf[buf_idx] = (ConvBufType)acc;
                continue;
            }

            // Averaging requested: blend with the stored intermediate.
            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + acc * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + acc) >> 1;
            }
            blend -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
        }
    }
}
1041 
/*
 * Unfiltered (copy) variant of the 16-bit jnt convolve kernels.
 *
 * Each source sample is shifted left by
 * (2 * FILTER_BITS - round_0 - round_1) bits and biased by round_offset, both
 * steps being carried out in ConvBufType. Then:
 *   - conv_params->do_average == 0: the biased value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped to bd bits and written to dst16.
 *
 * Both filter parameter sets and both sub-pel positions are ignored.
 */
void svt_av1_highbd_jnt_convolve_2d_copy_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                           int32_t dst16_stride, int32_t w, int32_t h,
                                           const InterpFilterParams *filter_params_x,
                                           const InterpFilterParams *filter_params_y,
                                           const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                           ConvolveParams *conv_params, int32_t bd) {
    (void)filter_params_x;
    (void)filter_params_y;
    (void)subpel_x_q4;
    (void)subpel_y_q4;

    const int32_t rnd0         = conv_params->round_0;
    const int32_t rnd1         = conv_params->round_1;
    const int32_t bits         = FILTER_BITS * 2 - rnd1 - rnd0;
    const int32_t offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t round_offset = (1 << (offset_bits - rnd1)) + (1 << (offset_bits - rnd1 - 1));
    assert(bits >= 0);

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;

    for (int32_t row = 0; row < h; ++row) {
        for (int32_t col = 0; col < w; ++col) {
            // Both steps intentionally narrow to ConvBufType.
            ConvBufType val = src[row * src_stride + col] << bits;
            val += (ConvBufType)round_offset;

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                buf[buf_idx] = val;
                continue;
            }

            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + val * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + val) >> 1;
            }
            blend -= round_offset;
            dst16[row * dst16_stride + col] = clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, bits), bd);
        }
    }
}
1080 
/*
 * Separable 2D interpolation of 16-bit samples with two possible outputs.
 *
 * Pass 1 filters horizontally over h + taps_y - 1 rows (starting
 * taps_y / 2 - 1 rows above the block); results are biased by
 * 1 << (bd + FILTER_BITS - 1), rounded by round_0 and kept in the int16_t
 * scratch buffer im_block (row pitch w).
 * Pass 2 filters im_block vertically with a bias of 1 << offset_bits and
 * rounds by round_1. Then:
 *   - conv_params->do_average == 0: the rounded value is stored in
 *     conv_params->dst.
 *   - otherwise: it is combined with the value already held in
 *     conv_params->dst (weighted by fwd_offset / bck_offset when
 *     use_jnt_comp_avg is set, plain mean otherwise), the bias is removed and
 *     the rounded result is clipped to bd bits and written to dst16.
 */
void svt_av1_highbd_jnt_convolve_2d_c(const uint16_t *src, int32_t src_stride, uint16_t *dst16,
                                      int32_t dst16_stride, int32_t w, int32_t h,
                                      const InterpFilterParams *filter_params_x,
                                      const InterpFilterParams *filter_params_y,
                                      const int32_t subpel_x_q4, const int32_t subpel_y_q4,
                                      ConvolveParams *conv_params, int32_t bd) {
    int16_t       im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    const int32_t taps_x     = filter_params_x->taps;
    const int32_t taps_y     = filter_params_y->taps;
    const int32_t im_rows    = h + taps_y - 1;
    const int32_t im_pitch   = w;
    const int32_t rows_above = taps_y / 2 - 1;
    const int32_t cols_left  = taps_x / 2 - 1;
    const int32_t rnd0       = conv_params->round_0;
    const int32_t rnd1       = conv_params->round_1;
    const int32_t round_bits = 2 * FILTER_BITS - rnd0 - rnd1;
    assert(round_bits >= 0);

    ConvBufType *const buf        = conv_params->dst;
    const int32_t      buf_stride = conv_params->dst_stride;

    // Pass 1: horizontal filter into im_block.
    const uint16_t *const src_top = src - rows_above * src_stride;
    const int16_t *const  kernel_x =
        av1_get_interp_filter_subpel_kernel(*filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    for (int row = 0; row < im_rows; ++row) {
        for (int col = 0; col < w; ++col) {
            int32_t sum = (1 << (bd + FILTER_BITS - 1));
            for (int tap = 0; tap < taps_x; ++tap) {
                sum += kernel_x[tap] * src_top[row * src_stride + col - cols_left + tap];
            }
            assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
            im_block[row * im_pitch + col] = (int16_t)ROUND_POWER_OF_TWO(sum, rnd0);
        }
    }

    // Pass 2: vertical filter over the intermediate rows.
    const int16_t *const im_origin    = im_block + rows_above * im_pitch;
    const int32_t        offset_bits  = bd + 2 * FILTER_BITS - rnd0;
    const int32_t        round_offset = (1 << (offset_bits - rnd1)) +
                                        (1 << (offset_bits - rnd1 - 1));
    const int16_t *const kernel_y =
        av1_get_interp_filter_subpel_kernel(*filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    for (int row = 0; row < h; ++row) {
        for (int col = 0; col < w; ++col) {
            int32_t sum = 1 << offset_bits;
            for (int tap = 0; tap < taps_y; ++tap) {
                sum += kernel_y[tap] * im_origin[(row - rows_above + tap) * im_pitch + col];
            }
            assert(0 <= sum && sum < (1 << (offset_bits + 2)));
            ConvBufType res = (ConvBufType)ROUND_POWER_OF_TWO(sum, rnd1);

            const int32_t buf_idx = row * buf_stride + col;
            if (!conv_params->do_average) {
                // No averaging requested: keep the biased intermediate.
                buf[buf_idx] = res;
                continue;
            }

            // Averaging requested: blend with the stored intermediate.
            int32_t blend = buf[buf_idx];
            if (conv_params->use_jnt_comp_avg) {
                blend = (blend * conv_params->fwd_offset + res * conv_params->bck_offset) >>
                        DIST_PRECISION_BITS;
            } else {
                blend = (blend + res) >> 1;
            }
            blend -= round_offset;
            dst16[row * dst16_stride + col] =
                clip_pixel_highbd(ROUND_POWER_OF_TWO(blend, round_bits), bd);
        }
    }
}
1145 
1146 aom_highbd_convolve_fn_t convolveHbd[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
asm_set_convolve_hbd_asm_table(void)1147 void asm_set_convolve_hbd_asm_table(void) {
1148     convolveHbd[0][0][0] = svt_av1_highbd_convolve_2d_copy_sr;
1149     convolveHbd[0][0][1] = svt_av1_highbd_jnt_convolve_2d_copy;
1150 
1151     convolveHbd[0][1][0] = svt_av1_highbd_convolve_y_sr;
1152     convolveHbd[0][1][1] = svt_av1_highbd_jnt_convolve_y;
1153 
1154     convolveHbd[1][0][0] = svt_av1_highbd_convolve_x_sr;
1155     convolveHbd[1][0][1] = svt_av1_highbd_jnt_convolve_x;
1156 
1157     convolveHbd[1][1][0] = svt_av1_highbd_convolve_2d_sr;
1158     convolveHbd[1][1][1] = svt_av1_highbd_jnt_convolve_2d;
1159 }
1160 
1161 AomConvolveFn convolve[/*subX*/ 2][/*subY*/ 2][/*bi*/ 2];
asm_set_convolve_asm_table(void)1162 void asm_set_convolve_asm_table(void) {
1163     convolve[0][0][0] = svt_av1_convolve_2d_copy_sr;
1164     convolve[0][0][1] = svt_av1_jnt_convolve_2d_copy;
1165 
1166     convolve[0][1][0] = svt_av1_convolve_y_sr;
1167     convolve[0][1][1] = svt_av1_jnt_convolve_y;
1168 
1169     convolve[1][0][0] = svt_av1_convolve_x_sr;
1170     convolve[1][0][1] = svt_av1_jnt_convolve_x;
1171 
1172     convolve[1][1][0] = svt_av1_convolve_2d_sr;
1173     convolve[1][1][1] = svt_av1_jnt_convolve_2d;
1174 }
1175 
// Ready-made filter parameter sets for the EIGHTTAP_REGULAR filter type.
// Initializer order: kernel table pointer, SUBPEL_TAPS, SUBPEL_SHIFTS, filter type.
// av1RegularFilter points at sub_pel_filters_8; av1RegularFilterW4 points at
// sub_pel_filters_4.
// NOTE(review): the W4 variant still uses SUBPEL_TAPS as its second field --
// presumably sub_pel_filters_4 rows are stored at full kernel width; confirm
// against the table's declaration.
InterpFilterParams av1RegularFilter = {
    (const int16_t *)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR};
InterpFilterParams av1RegularFilterW4 = {
    (const int16_t *)sub_pel_filters_4, SUBPEL_TAPS, SUBPEL_SHIFTS, EIGHTTAP_REGULAR};
1180 
// "Sharp" interpolation kernels: 16 rows of 8 coefficients each.
// Row 0 is the pass-through kernel (single coefficient of 128); every row's
// coefficients sum to 128.
// Presumably the row index is the sub-pel phase -- confirm against the kernel
// lookup helper.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {-2, 2, -6, 126, 8, -2, 2, 0},
    {-2, 6, -12, 124, 16, -6, 4, -2},
    {-2, 8, -18, 120, 26, -10, 6, -2},
    {-4, 10, -22, 116, 38, -14, 6, -2},
    {-4, 10, -22, 108, 48, -18, 8, -2},
    {-4, 10, -24, 100, 60, -20, 8, -2},
    {-4, 10, -24, 90, 70, -22, 10, -2},
    {-4, 12, -24, 80, 80, -24, 12, -4},
    {-2, 10, -22, 70, 90, -24, 10, -4},
    {-2, 8, -20, 60, 100, -24, 10, -4},
    {-2, 8, -18, 48, 108, -22, 10, -4},
    {-2, 6, -14, 38, 116, -22, 10, -4},
    {-2, 6, -10, 26, 120, -18, 8, -2},
    {-2, 4, -6, 16, 124, -12, 6, -2},
    {0, 2, -2, 8, 126, -6, 2, -2}};
1198 
// "Smooth" interpolation kernels: 16 rows of 8 coefficients each.
// Row 0 is the pass-through kernel; every row's coefficients sum to 128.
// The first and last columns are zero in every row.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 2, 28, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, -2, 16, 54, 48, 12, 0, 0},
    {0, -2, 14, 52, 52, 14, -2, 0},
    {0, 0, 12, 48, 54, 16, -2, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 28, 2, 0}};
// Bilinear kernels: 16 rows of 8 coefficients, only columns 3 and 4 non-zero.
// Row k holds {128 - 8 * k, 8 * k} in those two columns, so every row sums to 128.
DECLARE_ALIGNED(256, const InterpKernel, bilinear_filters[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 0, 120, 8, 0, 0, 0},
    {0, 0, 0, 112, 16, 0, 0, 0},
    {0, 0, 0, 104, 24, 0, 0, 0},
    {0, 0, 0, 96, 32, 0, 0, 0},
    {0, 0, 0, 88, 40, 0, 0, 0},
    {0, 0, 0, 80, 48, 0, 0, 0},
    {0, 0, 0, 72, 56, 0, 0, 0},
    {0, 0, 0, 64, 64, 0, 0, 0},
    {0, 0, 0, 56, 72, 0, 0, 0},
    {0, 0, 0, 48, 80, 0, 0, 0},
    {0, 0, 0, 40, 88, 0, 0, 0},
    {0, 0, 0, 32, 96, 0, 0, 0},
    {0, 0, 0, 24, 104, 0, 0, 0},
    {0, 0, 0, 16, 112, 0, 0, 0},
    {0, 0, 0, 8, 120, 0, 0, 0}};
// 4-tap "smooth" kernels stored at 8-coefficient width: 16 rows, with the
// non-zero coefficients confined to columns 2..5 (row 0 is the pass-through
// kernel). Every row's coefficients sum to 128.
DECLARE_ALIGNED(256, const InterpKernel, sub_pel_filters_4smooth[SUBPEL_SHIFTS]) = {
    {0, 0, 0, 128, 0, 0, 0, 0},
    {0, 0, 30, 62, 34, 2, 0, 0},
    {0, 0, 26, 62, 36, 4, 0, 0},
    {0, 0, 22, 62, 40, 4, 0, 0},
    {0, 0, 20, 60, 42, 6, 0, 0},
    {0, 0, 18, 58, 44, 8, 0, 0},
    {0, 0, 16, 56, 46, 10, 0, 0},
    {0, 0, 14, 54, 48, 12, 0, 0},
    {0, 0, 12, 52, 52, 12, 0, 0},
    {0, 0, 12, 48, 54, 14, 0, 0},
    {0, 0, 10, 46, 56, 16, 0, 0},
    {0, 0, 8, 44, 58, 18, 0, 0},
    {0, 0, 6, 42, 60, 20, 0, 0},
    {0, 0, 4, 40, 62, 22, 0, 0},
    {0, 0, 4, 36, 62, 26, 0, 0},
    {0, 0, 2, 34, 62, 30, 0, 0}};
1250 BlockSize scale_chroma_bsize(BlockSize bsize, int32_t subsampling_x, int32_t subsampling_y);
1251 
/*
 * 8-bit convolution used for intra block copy with a fractional offset.
 *
 * Selects the BILINEAR filter parameters for every direction whose sub-pel
 * argument is non-zero (NULL otherwise) and forwards to the 2-D, x-only or
 * y-only *_sr kernel. The sub-pel phase handed to the kernel is the constant 8
 * for an active direction and 0 for an inactive one, regardless of the actual
 * subpel_x_q4 / subpel_y_q4 values. When both are zero the y-only kernel is
 * called with both filter pointers NULL.
 */
void convolve_2d_for_intrabc(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h, int subpel_x_q4, int subpel_y_q4,
                             ConvolveParams *conv_params) {
    const int frac_x = subpel_x_q4 != 0;
    const int frac_y = subpel_y_q4 != 0;

    // The kernels take non-const filter pointers, hence the casts.
    InterpFilterParams *bilinear_x =
        frac_x ? (InterpFilterParams *)&av1_interp_filter_params_list[BILINEAR] : NULL;
    InterpFilterParams *bilinear_y =
        frac_y ? (InterpFilterParams *)&av1_interp_filter_params_list[BILINEAR] : NULL;

    if (frac_x && frac_y) {
        svt_av1_convolve_2d_sr(
            src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 8, 8, conv_params);
        return;
    }

    if (frac_x) {
        svt_av1_convolve_x_sr(
            src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 8, 0, conv_params);
        return;
    }

    svt_av1_convolve_y_sr(
        src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 0, 8, conv_params);
}
/*
 * High bit-depth convolution used for intra block copy with a fractional offset.
 *
 * Selects the BILINEAR filter parameters for every direction whose sub-pel
 * argument is non-zero (NULL otherwise) and forwards to the 2-D, x-only or
 * y-only high bit-depth *_sr kernel. The sub-pel phase handed to the kernel is
 * the constant 8 for an active direction and 0 for an inactive one. When both
 * sub-pel arguments are zero the y-only kernel is called with both filter
 * pointers NULL.
 */
void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride, uint16_t *dst,
                                    int dst_stride, int w, int h, int subpel_x_q4,
                                    int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    const int frac_x = subpel_x_q4 != 0;
    const int frac_y = subpel_y_q4 != 0;

    const InterpFilterParams *bilinear_x =
        frac_x ? &av1_interp_filter_params_list[BILINEAR] : NULL;
    const InterpFilterParams *bilinear_y =
        frac_y ? &av1_interp_filter_params_list[BILINEAR] : NULL;

    if (frac_x && frac_y) {
        svt_av1_highbd_convolve_2d_sr(
            src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 8, 8, conv_params, bd);
        return;
    }

    if (frac_x) {
        svt_av1_highbd_convolve_x_sr(
            src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 8, 0, conv_params, bd);
        return;
    }

    svt_av1_highbd_convolve_y_sr(
        src, src_stride, dst, dst_stride, w, h, bilinear_x, bilinear_y, 0, 8, conv_params, bd);
}
1345 
/*
 * 8-bit inter prediction for one block.
 *
 * Derives the x/y filter parameters from interp_filters and the block size,
 * then picks a convolution routine:
 *   - unscaled reference: the sub-pel values are first passed through
 *     revert_scale_extra_bits(); intra block copy with a fractional offset uses
 *     convolve_2d_for_intrabc(), everything else goes through the convolve[]
 *     table indexed by [subpel_x != 0][subpel_y != 0][is_compound];
 *   - scaled reference: svt_av1_convolve_2d_scale() (the intrabc branch is kept
 *     here as well, although the assert states intrabc is never scaled).
 *
 * sf is only checked by an assert.
 */
void svt_inter_predictor(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
                         const SubpelParams *subpel_params, const ScaleFactors *sf, int32_t w,
                         int32_t h, ConvolveParams *conv_params, InterpFilters interp_filters,
                         int32_t is_intrabc) {
    InterpFilterParams filter_x;
    InterpFilterParams filter_y;
    const int32_t      scaled = has_scale(subpel_params->xs, subpel_params->ys);

    av1_get_convolve_filter_params(interp_filters, &filter_x, &filter_y, w, h);

    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    assert(sf);
    UNUSED(sf);
    assert(IMPLIES(is_intrabc, !scaled));

    if (!scaled) {
        // Work on a local copy so the caller's sub-pel parameters stay untouched.
        SubpelParams unscaled = *subpel_params;
        revert_scale_extra_bits(&unscaled);

        const int frac_x = unscaled.subpel_x != 0;
        const int frac_y = unscaled.subpel_y != 0;

        if (is_intrabc && (frac_x || frac_y)) {
            convolve_2d_for_intrabc(src,
                                    src_stride,
                                    dst,
                                    dst_stride,
                                    w,
                                    h,
                                    unscaled.subpel_x,
                                    unscaled.subpel_y,
                                    conv_params);
            return;
        }

        convolve[frac_x][frac_y][conv_params->is_compound](src,
                                                           src_stride,
                                                           dst,
                                                           dst_stride,
                                                           w,
                                                           h,
                                                           &filter_x,
                                                           &filter_y,
                                                           unscaled.subpel_x,
                                                           unscaled.subpel_y,
                                                           conv_params);
        return;
    }

    // Scaled reference from here on.
    if (is_intrabc && (subpel_params->subpel_x != 0 || subpel_params->subpel_y != 0)) {
        convolve_2d_for_intrabc(src,
                                src_stride,
                                dst,
                                dst_stride,
                                w,
                                h,
                                subpel_params->subpel_x,
                                subpel_params->subpel_y,
                                conv_params);
        return;
    }

    // A compound prediction needs its intermediate buffer.
    assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
    svt_av1_convolve_2d_scale(src,
                              src_stride,
                              dst,
                              dst_stride,
                              w,
                              h,
                              &filter_x,
                              &filter_y,
                              subpel_params->subpel_x,
                              subpel_params->xs,
                              subpel_params->subpel_y,
                              subpel_params->ys,
                              conv_params);
}
1410 
/*
 * High bit-depth inter prediction for one block.
 *
 * Derives the x/y filter parameters from interp_filters and the block size,
 * then picks a convolution routine:
 *   - unscaled reference: the sub-pel values are first passed through
 *     revert_scale_extra_bits(); intra block copy with a fractional offset uses
 *     highbd_convolve_2d_for_intrabc(), everything else goes through the
 *     convolveHbd[] table indexed by [subpel_x != 0][subpel_y != 0][is_compound];
 *   - scaled reference: svt_av1_highbd_convolve_2d_scale() (the intrabc branch is
 *     kept here as well, although the assert states intrabc is never scaled).
 *
 * sf is only checked by an assert; bd is forwarded to the selected routine.
 */
void svt_highbd_inter_predictor(const uint16_t *src, int32_t src_stride, uint16_t *dst,
                                int32_t dst_stride, const SubpelParams *subpel_params,
                                const ScaleFactors *sf, int32_t w, int32_t h,
                                ConvolveParams *conv_params, InterpFilters interp_filters,
                                int32_t is_intrabc, int32_t bd) {
    InterpFilterParams filter_x;
    InterpFilterParams filter_y;
    const int32_t      scaled = has_scale(subpel_params->xs, subpel_params->ys);

    av1_get_convolve_filter_params(interp_filters, &filter_x, &filter_y, w, h);

    assert(conv_params->do_average == 0 || conv_params->do_average == 1);
    assert(sf);
    UNUSED(sf);
    assert(IMPLIES(is_intrabc, !scaled));

    if (!scaled) {
        // Work on a local copy so the caller's sub-pel parameters stay untouched.
        SubpelParams unscaled = *subpel_params;
        revert_scale_extra_bits(&unscaled);

        const int frac_x = unscaled.subpel_x != 0;
        const int frac_y = unscaled.subpel_y != 0;

        if (is_intrabc && (frac_x || frac_y)) {
            highbd_convolve_2d_for_intrabc(src,
                                           src_stride,
                                           dst,
                                           dst_stride,
                                           w,
                                           h,
                                           unscaled.subpel_x,
                                           unscaled.subpel_y,
                                           conv_params,
                                           bd);
            return;
        }

        convolveHbd[frac_x][frac_y][conv_params->is_compound](src,
                                                              src_stride,
                                                              dst,
                                                              dst_stride,
                                                              w,
                                                              h,
                                                              &filter_x,
                                                              &filter_y,
                                                              unscaled.subpel_x,
                                                              unscaled.subpel_y,
                                                              conv_params,
                                                              bd);
        return;
    }

    // Scaled reference from here on.
    if (is_intrabc && (subpel_params->subpel_x != 0 || subpel_params->subpel_y != 0)) {
        highbd_convolve_2d_for_intrabc(src,
                                       src_stride,
                                       dst,
                                       dst_stride,
                                       w,
                                       h,
                                       subpel_params->subpel_x,
                                       subpel_params->subpel_y,
                                       conv_params,
                                       bd);
        return;
    }

    // A compound prediction needs its intermediate buffer.
    assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
    svt_av1_highbd_convolve_2d_scale(src,
                                     src_stride,
                                     dst,
                                     dst_stride,
                                     w,
                                     h,
                                     &filter_x,
                                     &filter_y,
                                     subpel_params->subpel_x,
                                     subpel_params->xs,
                                     subpel_params->subpel_y,
                                     subpel_params->ys,
                                     conv_params,
                                     bd);
}
1479 
1480 
1481 #define USE_PRECOMPUTED_WEDGE_SIGN 1
1482 #define USE_PRECOMPUTED_WEDGE_MASK 1
1483 
1484 #if USE_PRECOMPUTED_WEDGE_MASK
// Precomputed 1-D wedge weight profiles (64 values each) that ramp from 0 up
// to 64 around the centre of the row. init_wedge_primary_masks() builds the 2-D
// primary masks from them: the "even" and "odd" oblique profiles are shifted
// into alternating rows of the WEDGE_OBLIQUE63 mask, and the vertical profile
// is copied unshifted into every row of the WEDGE_VERTICAL mask.
static const uint8_t wedge_primary_oblique_odd[MASK_PRIMARY_SIZE] = {
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  2,  6,  18, 37, 53, 60, 63, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
static const uint8_t wedge_primary_oblique_even[MASK_PRIMARY_SIZE] = {
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  1,  4,  11, 27, 46, 58, 62, 63, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
static const uint8_t wedge_primary_vertical[MASK_PRIMARY_SIZE] = {
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  2,  7,  21, 43, 57, 62, 64, 64, 64, 64, 64, 64, 64, 64, 64,
        64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
1500 
1501 
// Precomputed sign-flip flag per block size (row) and wedge index (column).
// wedge_params_lookup hands out individual rows of this table, selected by a
// BLOCK_* constant. Rows tagged "not used" are all zero.
// The table is not const: when USE_PRECOMPUTED_WEDGE_SIGN is disabled,
// init_wedge_signs() clears it at start-up.
DECLARE_ALIGNED(16, static uint8_t, wedge_signflip_lookup[BlockSizeS_ALL][MAX_WEDGE_TYPES]) = {
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, },
        { 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, },
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
        { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },  // not used
};
1526 
1527 
// 16-entry wedge codebook that wedge_params_lookup assigns to BLOCK_8X16,
// BLOCK_16X32 and BLOCK_8X32. Each entry is a wedge direction followed by two
// small integers.
// NOTE(review): the two integers are presumably the x and y offsets of the
// wedge boundary -- confirm against the WedgeCodeType declaration.
static const WedgeCodeType wedge_codebook_16_hgtw[16] = {
        { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
        { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
        { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 4 },
        { WEDGE_HORIZONTAL, 4, 6 }, { WEDGE_VERTICAL, 4, 4 },
        { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
        { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
        { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
        { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
1538 
// 16-entry wedge codebook that wedge_params_lookup assigns to BLOCK_16X8,
// BLOCK_32X16 and BLOCK_32X8. Each entry is a wedge direction followed by two
// small integers.
// NOTE(review): the two integers are presumably the x and y offsets of the
// wedge boundary -- confirm against the WedgeCodeType declaration.
static const WedgeCodeType wedge_codebook_16_hltw[16] = {
        { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
        { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
        { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 4, 4 },
        { WEDGE_VERTICAL, 6, 4 }, { WEDGE_HORIZONTAL, 4, 4 },
        { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
        { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
        { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
        { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
1549 
// 16-entry wedge codebook that wedge_params_lookup assigns to the square sizes
// BLOCK_8X8, BLOCK_16X16 and BLOCK_32X32. Each entry is a wedge direction
// followed by two small integers.
// NOTE(review): the two integers are presumably the x and y offsets of the
// wedge boundary -- confirm against the WedgeCodeType declaration.
static const WedgeCodeType wedge_codebook_16_heqw[16] = {
        { WEDGE_OBLIQUE27, 4, 4 }, { WEDGE_OBLIQUE63, 4, 4 },
        { WEDGE_OBLIQUE117, 4, 4 }, { WEDGE_OBLIQUE153, 4, 4 },
        { WEDGE_HORIZONTAL, 4, 2 }, { WEDGE_HORIZONTAL, 4, 6 },
        { WEDGE_VERTICAL, 2, 4 }, { WEDGE_VERTICAL, 6, 4 },
        { WEDGE_OBLIQUE27, 4, 2 }, { WEDGE_OBLIQUE27, 4, 6 },
        { WEDGE_OBLIQUE153, 4, 2 }, { WEDGE_OBLIQUE153, 4, 6 },
        { WEDGE_OBLIQUE63, 2, 4 }, { WEDGE_OBLIQUE63, 6, 4 },
        { WEDGE_OBLIQUE117, 2, 4 }, { WEDGE_OBLIQUE117, 6, 4 },
};
1560 
// Wedge parameters per block size. Each entry lists, in order: the number of
// wedge index bits, the codebook, the sign-flip row and the mask table for that
// size. Entries of { 0, NULL, NULL, NULL } mark sizes without wedge support
// (is_interintra_wedge_used() reports bits > 0).
// NOTE(review): entries are positional -- this presumably relies on the row
// order matching the BlockSize enum values; confirm against the enum.
static const WedgeParamsType wedge_params_lookup[BlockSizeS_ALL] = {
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_8X8],
                wedge_masks[BLOCK_8X8] },
        { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X16],
                wedge_masks[BLOCK_8X16] },
        { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_16X8],
                wedge_masks[BLOCK_16X8] },
        { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_16X16],
                wedge_masks[BLOCK_16X16] },
        { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_16X32],
                wedge_masks[BLOCK_16X32] },
        { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X16],
                wedge_masks[BLOCK_32X16] },
        { 4, wedge_codebook_16_heqw, wedge_signflip_lookup[BLOCK_32X32],
                wedge_masks[BLOCK_32X32] },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
        { 4, wedge_codebook_16_hgtw, wedge_signflip_lookup[BLOCK_8X32],
                wedge_masks[BLOCK_8X32] },
        { 4, wedge_codebook_16_hltw, wedge_signflip_lookup[BLOCK_32X8],
                wedge_masks[BLOCK_32X8] },
        { 0, NULL, NULL, NULL },
        { 0, NULL, NULL, NULL },
};
1594 
/* Returns non-zero when wedge_params_lookup has a positive bit count for sb_type. */
int is_interintra_wedge_used(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits > 0;
}
1598 
/* Returns the wedge bit count stored in wedge_params_lookup for sb_type. */
int32_t get_wedge_bits_lookup(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits;
}
1602 
/*
 * Returns the mask pointer stored for block size sb_type under
 * [wedge_sign][wedge_index] in that size's mask table.
 */
const uint8_t *av1_get_contiguous_soft_mask(int wedge_index, int wedge_sign,
                                            BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->masks[wedge_sign][wedge_index];
}
1607 
/*
 * Copies an h-row by w-byte block from src to dst, honouring each buffer's
 * row stride. The filter arguments are ignored.
 */
static void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                                ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride, int w, int h) {
    (void)filter_x;
    (void)filter_x_stride;
    (void)filter_y;
    (void)filter_y_stride;

    for (int row = 0; row < h; ++row) {
        svt_memcpy(dst + row * dst_stride, src + row * src_stride, w);
    }
}
1623 
/*
 * Copies a width-byte row from src to dst displaced by shift positions.
 * A non-negative shift moves the data towards higher indices and fills the
 * vacated leading bytes with src[0]; a negative shift moves it towards lower
 * indices and fills the vacated trailing bytes with src[width - 1].
 */
static void shift_copy(const uint8_t *src, uint8_t *dst, int shift, int width) {
    if (shift < 0) {
        const int pad = -shift;
        svt_memcpy(dst, src + pad, width - pad);
        memset(dst + width - pad, src[width - 1], pad);
        return;
    }

    svt_memcpy(dst + shift, src, width - shift);
    memset(dst, src[0], shift);
}
1634 
/* Returns the wedge bit count stored in wedge_params_lookup for sb_type. */
int get_wedge_params_bits(BlockSize sb_type) {
    const WedgeParamsType *const params = &wedge_params_lookup[sb_type];
    return params->bits;
}
1638 
1639 #endif // USE_PRECOMPUTED_WEDGE_MASK
1640 
1641 
// Primary wedge masks, indexed [negative][direction]; each mask is a
// MASK_PRIMARY_SIZE x MASK_PRIMARY_SIZE grid filled by init_wedge_primary_masks().
DECLARE_ALIGNED(16, static uint8_t,
                wedge_mask_obl[2][WEDGE_DIRECTIONS][MASK_PRIMARY_SIZE * MASK_PRIMARY_SIZE]);

// 4 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
DECLARE_ALIGNED(16, static uint8_t, wedge_mask_buf[2 * MAX_WEDGE_TYPES * 4 * MAX_WEDGE_SQUARE]);
1649 
/*
 * Fills wedge_mask_obl for every wedge direction, for both the primary ([0])
 * and the complement ([1]).
 *
 * Step 1 builds the WEDGE_OBLIQUE63 and WEDGE_VERTICAL primaries, either from
 * the precomputed 1-D profiles (USE_PRECOMPUTED_WEDGE_MASK) or analytically
 * with tanh(). Step 2 derives all other directions and the complements from
 * those two by transposing, mirroring and subtracting from
 * (1 << WEDGE_WEIGHT_BITS).
 */
static void init_wedge_primary_masks() {
    const int w      = MASK_PRIMARY_SIZE;
    const int h      = MASK_PRIMARY_SIZE;
    const int stride = MASK_PRIMARY_STRIDE;
    // Note: index [0] stores the primary, and [1] its complement.
#if USE_PRECOMPUTED_WEDGE_MASK
    // Generate prototype by shifting the primary
    // Rows are processed in pairs; the shift starts at h / 4 and drops by one
    // per pair, so the oblique edge moves one column every two rows.
    int shift = h / 4;
    for (int i = 0; i < h; i += 2) {
        // Even row: "even" oblique profile at the current shift.
        shift_copy(wedge_primary_oblique_even,
                   &wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride],
                   shift,
                   MASK_PRIMARY_SIZE);
        shift--;
        // Odd row: "odd" oblique profile at the already decremented shift.
        shift_copy(wedge_primary_oblique_odd,
                   &wedge_mask_obl[0][WEDGE_OBLIQUE63][(i + 1) * stride],
                   shift,
                   MASK_PRIMARY_SIZE);
        // The vertical mask repeats the same profile on every row.
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][i * stride],
                   wedge_primary_vertical,
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
        svt_memcpy(&wedge_mask_obl[0][WEDGE_VERTICAL][(i + 1) * stride],
                   wedge_primary_vertical,
                   MASK_PRIMARY_SIZE * sizeof(wedge_primary_vertical[0]));
    }
#else
    // Analytic fallback: weight = round((1 + tanh(d / smoother_param)) * 32),
    // where d is the projection of the centred coordinates (x, y) onto the
    // normalised direction a[] for the oblique mask, and x alone for the
    // vertical mask.
    static const double smoother_param = 2.85;
    const int           a[2]           = {2, 1};
    const double        asqrt          = sqrt(a[0] * a[0] + a[1] * a[1]);
    for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; ++j) {
            int       x   = (2 * j + 1 - w);
            int       y   = (2 * i + 1 - h);
            double    d   = (a[0] * x + a[1] * y) / asqrt;
            const int msk = (int)rint((1.0 + tanh(d / smoother_param)) * 32);
            wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j] = msk;
            const int mskx = (int)rint((1.0 + tanh(x / smoother_param)) * 32);
            wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j] = mskx;
        }
    }
#endif // USE_PRECOMPUTED_WEDGE_MASK
    // Derive the remaining directions and all complements from the two primaries.
    for (int i = 0; i < h; ++i) {
        for (int j = 0; j < w; ++j) {
            const int msk = wedge_mask_obl[0][WEDGE_OBLIQUE63][i * stride + j];
            // OBLIQUE27 is the transpose of OBLIQUE63.
            wedge_mask_obl[0][WEDGE_OBLIQUE27][j * stride + i] = msk;
            // OBLIQUE117 / OBLIQUE153 primaries: column-mirrored (and, for 153,
            // transposed) positions holding the complemented weight.
            wedge_mask_obl[0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
            wedge_mask_obl[0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
                    (1 << WEDGE_WEIGHT_BITS) - msk;
            // Complements ([1]) of OBLIQUE63 / OBLIQUE27.
            wedge_mask_obl[1][WEDGE_OBLIQUE63][i * stride + j] =
            wedge_mask_obl[1][WEDGE_OBLIQUE27][j * stride + i] = (1 << WEDGE_WEIGHT_BITS) - msk;
            // Complements ([1]) of OBLIQUE117 / OBLIQUE153 get the original weight back.
            wedge_mask_obl[1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
            wedge_mask_obl[1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] = msk;
            const int mskx = wedge_mask_obl[0][WEDGE_VERTICAL][i * stride + j];
            // HORIZONTAL is the transpose of VERTICAL.
            wedge_mask_obl[0][WEDGE_HORIZONTAL][j * stride + i] = mskx;
            wedge_mask_obl[1][WEDGE_VERTICAL][i * stride + j] =
            wedge_mask_obl[1][WEDGE_HORIZONTAL][j * stride + i] =
                    (1 << WEDGE_WEIGHT_BITS) - mskx;
        }
    }
}
1710 
1711 #if !USE_PRECOMPUTED_WEDGE_SIGN
// If the signs for the wedges for various blocksizes are
// inconsistent flip the sign flag. Do it only once for every
// wedge codebook.
//
// For every block size that has a wedge codebook (bits != 0) and every wedge
// in it, this averages the primary mask along its first row and first column
// (bw + bh - 1 samples, rounded to nearest) and stores (avg < 32) as the
// wedge's sign-flip flag. wedge_signflip_lookup is zeroed first.
//
// Only built when USE_PRECOMPUTED_WEDGE_SIGN evaluates to 0.
// NOTE(review): this path spells its types BLOCK_SIZE / BLOCK_SIZES_ALL /
// wedge_params_type, unlike the BlockSize / BlockSizeS_ALL / WedgeParamsType
// used by the neighbouring functions, and it calls get_wedge_mask_inplace()
// ahead of its definition - presumably it does not build as-is; confirm
// before enabling.
static void init_wedge_signs() {
    memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
    for (BLOCK_SIZE sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES_ALL; ++sb_type) {
        const int               bw           = block_size_wide[sb_type];
        const int               bh           = block_size_high[sb_type];
        // By-value copy of the lookup entry; the store to signflip[] below
        // assumes signflip is a pointer member - TODO confirm.
        const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
        const int               wbits        = wedge_params.bits;
        const int               wtypes       = 1 << wbits;

        if (wbits) {
            for (int w = 0; w < wtypes; ++w) {
                // Get the mask primary, i.e. index [0]
                const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
                int            avg  = 0;
                // Sum the top row, then the left column (skipping the shared
                // corner sample, hence i starting at 1).
                for (int i = 0; i < bw; ++i) avg += mask[i];
                for (int i = 1; i < bh; ++i) avg += mask[i * MASK_PRIMARY_STRIDE];
                // Rounded integer mean over the bw + bh - 1 border samples.
                avg = (avg + (bw + bh - 1) / 2) / (bw + bh - 1);
                // Default sign of this wedge is 1 if the average < 32, 0 otherwise.
                // If default sign is 1:
                //   If sign requested is 0, we need to flip the sign and return
                //   the complement i.e. index [1] instead. If sign requested is 1
                //   we need to flip the sign and return index [0] instead.
                // If default sign is 0:
                //   If sign requested is 0, we need to return index [0] the primary
                //   if sign requested is 1, we need to return the complement index [1]
                //   instead.
                wedge_params.signflip[w] = (avg < 32);
            }
        }
    }
}
1746 #endif // !USE_PRECOMPUTED_WEDGE_SIGN
1747 
get_wedge_mask_inplace(int wedge_index,int neg,BlockSize sb_type)1748 static const uint8_t *get_wedge_mask_inplace(int wedge_index, int neg, BlockSize sb_type) {
1749     const int            bh = block_size_high[sb_type];
1750     const int            bw = block_size_wide[sb_type];
1751 
1752     assert(wedge_index >= 0 && wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
1753     const WedgeCodeType *a  = wedge_params_lookup[sb_type].codebook + wedge_index;
1754     int                  woff, hoff;
1755     const uint8_t        wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
1756 
1757     woff   = (a->x_offset * bw) >> 3;
1758     hoff   = (a->y_offset * bh) >> 3;
1759     return wedge_mask_obl[neg ^ wsignflip][a->direction] +
1760         MASK_PRIMARY_STRIDE * (MASK_PRIMARY_SIZE / 2 - hoff) + MASK_PRIMARY_SIZE / 2 - woff;
1761 }
1762 
init_wedge_masks()1763 static void init_wedge_masks() {
1764     uint8_t * dst = wedge_mask_buf;
1765     memset(wedge_masks, 0, sizeof(wedge_masks));
1766     for (BlockSize bsize = BLOCK_4X4; bsize < BlockSizeS_ALL; ++bsize) {
1767         const int              bw           = block_size_wide[bsize];
1768         const int              bh           = block_size_high[bsize];
1769         const WedgeParamsType *wedge_params = &wedge_params_lookup[bsize];
1770         const int              wbits        = wedge_params->bits;
1771         const int              wtypes       = 1 << wbits;
1772         if (wbits == 0) continue;
1773         for (int w = 0; w < wtypes; ++w) {
1774             const uint8_t *mask;
1775             mask = get_wedge_mask_inplace(w, 0, bsize);
1776             aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
1777             wedge_params->masks[0][w] = dst;
1778             dst += bw * bh;
1779 
1780             mask = get_wedge_mask_inplace(w, 1, bsize);
1781             aom_convolve_copy_c(mask, MASK_PRIMARY_STRIDE, dst, bw, NULL, 0, NULL, 0, bw, bh);
1782             wedge_params->masks[1][w] = dst;
1783             dst += bw * bh;
1784         }
1785         assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
1786     }
1787 }
1788 
1789 // Equation of line: f(x, y) = a[0]*(x - a[2]*w/8) + a[1]*(y - a[3]*h/8) = 0
svt_av1_init_wedge_masks(void)1790 void svt_av1_init_wedge_masks(void) {
1791     init_wedge_primary_masks();
1792 #if !USE_PRECOMPUTED_WEDGE_SIGN
1793     init_wedge_signs();
1794 #endif // !USE_PRECOMPUTED_WEDGE_SIGN
1795     init_wedge_masks();
1796 }
1797 
1798 int is_masked_compound_type(COMPOUND_TYPE type);
1799 
1800 /* clang-format off */
// One-dimensional inter-intra smoothing weights, non-increasing from 60 down
// to 1. build_smooth_interintra_mask() samples this table at
// (position * ii_size_scales[bsize]), so smaller blocks step through it faster.
// NOTE(review): presumably the weight (out of 64) given to the intra
// predictor at that distance from the block edge - confirm against the
// blend routine.
static const uint8_t ii_weights1d[MAX_SB_SIZE] = {
        60, 58, 56, 54, 52, 50, 48, 47, 45, 44, 42, 41, 39, 38, 37, 35, 34, 33, 32,
        31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 22, 21, 20, 19, 19, 18, 18, 17, 16,
        16, 15, 15, 14, 14, 13, 13, 12, 12, 12, 11, 11, 10, 10, 10,  9,  9,  9,  8,
        8,  8,  8,  7,  7,  7,  7,  6,  6,  6,  6,  6,  5,  5,  5,  5,  5,  4,  4,
        4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
};
// Per-block-size step used when indexing ii_weights1d: entry [bsize] is the
// multiplier applied to a row/column position in
// build_smooth_interintra_mask(). Indexed by BlockSize.
// NOTE(review): the table is only read in the code visible here; it could
// likely be declared const - confirm no writer exists elsewhere in the file.
static uint8_t ii_size_scales[BlockSizeS_ALL] = {
        32, 16, 16, 16, 8, 8, 8, 4,
        4,  4,  2,  2,  2, 1, 1, 1,
        8,  8,  4,  4,  2, 2
};
1815 /* clang-format on */
1816 
build_smooth_interintra_mask(uint8_t * mask,int stride,BlockSize plane_bsize,InterIntraMode mode)1817 void build_smooth_interintra_mask(uint8_t *mask, int stride, BlockSize plane_bsize,
1818                                   InterIntraMode mode) {
1819     const int bw         = block_size_wide[plane_bsize];
1820     const int bh         = block_size_high[plane_bsize];
1821     const int size_scale = ii_size_scales[plane_bsize];
1822 
1823     switch (mode) {
1824         case II_V_PRED:
1825             for (int i = 0; i < bh; ++i) {
1826                 memset(mask, ii_weights1d[i * size_scale], bw * sizeof(mask[0]));
1827                 mask += stride;
1828             }
1829             break;
1830 
1831         case II_H_PRED:
1832             for (int i = 0; i < bh; ++i) {
1833                 for (int j = 0; j < bw; ++j) mask[j] = ii_weights1d[j * size_scale];
1834                 mask += stride;
1835             }
1836             break;
1837 
1838         case II_SMOOTH_PRED:
1839             for (int i = 0; i < bh; ++i) {
1840                 for (int j = 0; j < bw; ++j) mask[j] = ii_weights1d[(i < j ? i : j) * size_scale];
1841                 mask += stride;
1842             }
1843             break;
1844 
1845         case II_DC_PRED:
1846         default:
1847             for (int i = 0; i < bh; ++i) {
1848                 memset(mask, 32, bw * sizeof(mask[0]));
1849                 mask += stride;
1850             }
1851             break;
1852     }
1853 }
1854 
combine_interintra_highbd(InterIntraMode mode,uint8_t use_wedge_interintra,uint8_t wedge_index,uint8_t wedge_sign,BlockSize bsize,BlockSize plane_bsize,uint8_t * comppred8,int compstride,const uint8_t * interpred8,int interstride,const uint8_t * intrapred8,int intrastride,int bd)1855 void combine_interintra_highbd(InterIntraMode mode, uint8_t use_wedge_interintra,
1856                                uint8_t wedge_index, uint8_t wedge_sign, BlockSize bsize,
1857                                BlockSize plane_bsize, uint8_t *comppred8, int compstride,
1858                                const uint8_t *interpred8, int interstride,
1859                                const uint8_t *intrapred8, int intrastride, int bd) {
1860     const int bw = block_size_wide[plane_bsize];
1861     const int bh = block_size_high[plane_bsize];
1862 
1863     if (use_wedge_interintra) {
1864         if (is_interintra_wedge_used(bsize)) {
1865             const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
1866             const int      subh = 2 * mi_size_high[bsize] == bh;
1867             const int      subw = 2 * mi_size_wide[bsize] == bw;
1868             svt_aom_highbd_blend_a64_mask(comppred8,
1869                                           compstride,
1870                                           intrapred8,
1871                                           intrastride,
1872                                           interpred8,
1873                                           interstride,
1874                                           mask,
1875                                           block_size_wide[bsize],
1876                                           bw,
1877                                           bh,
1878                                           subw,
1879                                           subh,
1880                                           bd);
1881         }
1882         return;
1883     }
1884 
1885     uint8_t mask[MAX_SB_SQUARE];
1886     build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
1887     svt_aom_highbd_blend_a64_mask(comppred8,
1888                                   compstride,
1889                                   intrapred8,
1890                                   intrastride,
1891                                   interpred8,
1892                                   interstride,
1893                                   mask,
1894                                   bw,
1895                                   bw,
1896                                   bh,
1897                                   0,
1898                                   0,
1899                                   bd);
1900 }
1901 
av1_get_compound_type_mask(const InterInterCompoundData * const comp_data,uint8_t * seg_mask,BlockSize sb_type)1902 static const uint8_t *av1_get_compound_type_mask(const InterInterCompoundData *const comp_data,
1903                                                  uint8_t *seg_mask, BlockSize sb_type) {
1904     assert(is_masked_compound_type(comp_data->type));
1905     (void)sb_type;
1906     switch (comp_data->type) {
1907         case COMPOUND_WEDGE:
1908             return av1_get_contiguous_soft_mask(comp_data->wedge_index, comp_data->wedge_sign, sb_type);
1909         case COMPOUND_DIFFWTD: return seg_mask;
1910         default: assert(0); return NULL;
1911     }
1912 }
1913 
build_masked_compound_no_round(uint8_t * dst,int dst_stride,const CONV_BUF_TYPE * src0,int src0_stride,const CONV_BUF_TYPE * src1,int src1_stride,const InterInterCompoundData * const comp_data,uint8_t * seg_mask,BlockSize sb_type,int h,int w,ConvolveParams * conv_params,uint8_t bit_depth,EbBool is_16bit)1914 void build_masked_compound_no_round(uint8_t *dst, int dst_stride, const CONV_BUF_TYPE *src0,
1915                                     int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride,
1916                                     const InterInterCompoundData *const comp_data,
1917                                     uint8_t *seg_mask, BlockSize sb_type, int h, int w,
1918                                     ConvolveParams *conv_params, uint8_t bit_depth, EbBool is_16bit) {
1919     // Derive subsampling from h and w passed in. May be refactored to
1920     // pass in subsampling factors directly.
1921     const int      subh = (2 << mi_size_high_log2[sb_type]) == h;
1922     const int      subw = (2 << mi_size_wide_log2[sb_type]) == w;
1923     const uint8_t *mask = av1_get_compound_type_mask(comp_data, seg_mask, sb_type);
1924 
1925     if (is_16bit) {
1926         svt_aom_highbd_blend_a64_d16_mask(dst,
1927                                           dst_stride,
1928                                           src0,
1929                                           src0_stride,
1930                                           src1,
1931                                           src1_stride,
1932                                           mask,
1933                                           block_size_wide[sb_type],
1934                                           w,
1935                                           h,
1936                                           subw,
1937                                           subh,
1938                                           conv_params,
1939                                           bit_depth);
1940     } else {
1941         svt_aom_lowbd_blend_a64_d16_mask(dst,
1942                                          dst_stride,
1943                                          src0,
1944                                          src0_stride,
1945                                          src1,
1946                                          src1_stride,
1947                                          mask,
1948                                          block_size_wide[sb_type],
1949                                          w,
1950                                          h,
1951                                          subw,
1952                                          subh,
1953                                          conv_params);
1954     }
1955 }
1956 
1957 
av1_find_ref_dv(IntMv * ref_dv,const TileInfo * const tile,int mib_size,int mi_row,int mi_col)1958 void av1_find_ref_dv(IntMv *ref_dv, const TileInfo *const tile, int mib_size, int mi_row,
1959                      int mi_col) {
1960     (void)mi_col;
1961     if (mi_row - mib_size < tile->mi_row_start) {
1962         ref_dv->as_mv.row = 0;
1963         ref_dv->as_mv.col = -MI_SIZE * mib_size - INTRABC_DELAY_PIXELS;
1964     } else {
1965         ref_dv->as_mv.row = -MI_SIZE * mib_size;
1966         ref_dv->as_mv.col = 0;
1967     }
1968     ref_dv->as_mv.row *= 8;
1969     ref_dv->as_mv.col *= 8;
1970 }
1971 
// Number of elements in the array `x`, as an int32_t.
// `x` must be a real array (not a pointer or a decayed array parameter).
// The argument and the whole expansion are parenthesized so the macro behaves
// as a single primary expression in any context (e.g. `sizeof n_elements(a)`).
#define n_elements(x) ((int32_t)(sizeof(x) / sizeof((x)[0])))
1973 
comp_ref0(int32_t ref_idx)1974 MvReferenceFrame comp_ref0(int32_t ref_idx) {
1975     static const MvReferenceFrame lut[] = {
1976             LAST_FRAME, // LAST_LAST2_FRAMES,
1977             LAST_FRAME, // LAST_LAST3_FRAMES,
1978             LAST_FRAME, // LAST_GOLDEN_FRAMES,
1979             BWDREF_FRAME, // BWDREF_ALTREF_FRAMES,
1980             LAST2_FRAME, // LAST2_LAST3_FRAMES
1981             LAST2_FRAME, // LAST2_GOLDEN_FRAMES,
1982             LAST3_FRAME, // LAST3_GOLDEN_FRAMES,
1983             BWDREF_FRAME, // BWDREF_ALTREF2_FRAMES,
1984             ALTREF2_FRAME, // ALTREF2_ALTREF_FRAMES,
1985     };
1986     assert(n_elements(lut) == TOTAL_UNIDIR_COMP_REFS);
1987     return lut[ref_idx];
1988 }
1989 
comp_ref1(int32_t ref_idx)1990 MvReferenceFrame comp_ref1(int32_t ref_idx) {
1991     static const MvReferenceFrame lut[] = {
1992             LAST2_FRAME, // LAST_LAST2_FRAMES,
1993             LAST3_FRAME, // LAST_LAST3_FRAMES,
1994             GOLDEN_FRAME, // LAST_GOLDEN_FRAMES,
1995             ALTREF_FRAME, // BWDREF_ALTREF_FRAMES,
1996             LAST3_FRAME, // LAST2_LAST3_FRAMES
1997             GOLDEN_FRAME, // LAST2_GOLDEN_FRAMES,
1998             GOLDEN_FRAME, // LAST3_GOLDEN_FRAMES,
1999             ALTREF2_FRAME, // BWDREF_ALTREF2_FRAMES,
2000             ALTREF_FRAME, // ALTREF2_ALTREF_FRAMES,
2001     };
2002     assert(n_elements(lut) == TOTAL_UNIDIR_COMP_REFS);
2003     return lut[ref_idx];
2004 }
2005 
get_uni_comp_ref_idx(const MvReferenceFrame * const rf)2006 int8_t get_uni_comp_ref_idx(const MvReferenceFrame *const rf) {
2007     // Single ref pred
2008     if (rf[1] <= INTRA_FRAME) return -1;
2009 
2010     // Bi-directional comp ref pred
2011     if ((rf[0] < BWDREF_FRAME) && (rf[1] >= BWDREF_FRAME)) return -1;
2012 
2013     for (int8_t ref_idx = 0; ref_idx < TOTAL_UNIDIR_COMP_REFS; ++ref_idx) {
2014         if (rf[0] == comp_ref0(ref_idx) && rf[1] == comp_ref1(ref_idx)) return ref_idx;
2015     }
2016     return -1;
2017 }
2018 
av1_ref_frame_type(const MvReferenceFrame * const rf)2019 int8_t av1_ref_frame_type(const MvReferenceFrame *const rf) {
2020     if (rf[1] > INTRA_FRAME) {
2021         const int8_t uni_comp_ref_idx = get_uni_comp_ref_idx(rf);
2022         if (uni_comp_ref_idx >= 0) {
2023             assert((TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx) <
2024                    MODE_CTX_REF_FRAMES);
2025             return TOTAL_REFS_PER_FRAME + FWD_REFS * BWD_REFS + uni_comp_ref_idx;
2026         } else {
2027             return TOTAL_REFS_PER_FRAME + FWD_RF_OFFSET(rf[0]) + BWD_RF_OFFSET(rf[1]) * FWD_REFS;
2028         }
2029     }
2030 
2031     return rf[0];
2032 }
2033 
// Inverse of av1_ref_frame_type() for compound references: row
// (ref_frame_type - TOTAL_REFS_PER_FRAME) holds the {rf[0], rf[1]} pair, as
// read by av1_set_ref_frame().
// The first 12 rows enumerate forward x backward pairs with the forward
// reference varying fastest; the remaining 9 rows are the uni-directional
// pairs in the same order as the comp_ref0()/comp_ref1() tables.
// NOTE(review): the row order presumably relies on FWD_REFS == 4 and
// BWD_REFS == 3 to line up with av1_ref_frame_type() - confirm.
static MvReferenceFrame ref_frame_map[TOTAL_COMP_REFS][2] = {
        {LAST_FRAME, BWDREF_FRAME},
        {LAST2_FRAME, BWDREF_FRAME},
        {LAST3_FRAME, BWDREF_FRAME},
        {GOLDEN_FRAME, BWDREF_FRAME},
        {LAST_FRAME, ALTREF2_FRAME},
        {LAST2_FRAME, ALTREF2_FRAME},
        {LAST3_FRAME, ALTREF2_FRAME},
        {GOLDEN_FRAME, ALTREF2_FRAME},
        {LAST_FRAME, ALTREF_FRAME},
        {LAST2_FRAME, ALTREF_FRAME},
        {LAST3_FRAME, ALTREF_FRAME},
        {GOLDEN_FRAME, ALTREF_FRAME},
        {LAST_FRAME, LAST2_FRAME},
        {LAST_FRAME, LAST3_FRAME},
        {LAST_FRAME, GOLDEN_FRAME},
        {BWDREF_FRAME, ALTREF_FRAME},
        // NOTE: Following reference frame pairs are not supported to be explicitly
        //       signalled, but they are possibly chosen by the use of skip_mode,
        //       which may use the most recent one-sided reference frame pair.
        {LAST2_FRAME, LAST3_FRAME},
        {LAST2_FRAME, GOLDEN_FRAME},
        {LAST3_FRAME, GOLDEN_FRAME},
        {BWDREF_FRAME, ALTREF2_FRAME},
        {ALTREF2_FRAME, ALTREF_FRAME}};
2059 
av1_set_ref_frame(MvReferenceFrame * rf,int8_t ref_frame_type)2060 void av1_set_ref_frame(MvReferenceFrame *rf, int8_t ref_frame_type) {
2061     if (ref_frame_type >= TOTAL_REFS_PER_FRAME) {
2062         rf[0] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][0];
2063         rf[1] = ref_frame_map[ref_frame_type - TOTAL_REFS_PER_FRAME][1];
2064     } else {
2065         rf[0] = ref_frame_type;
2066         rf[1] = NONE_FRAME;
2067         // assert(ref_frame_type > NONE_FRAME); AMIR
2068     }
2069 }
2070 
svt_av1_skip_u4x4_pred_in_obmc(BlockSize bsize,int dir,int subsampling_x,int subsampling_y)2071 int svt_av1_skip_u4x4_pred_in_obmc(BlockSize bsize, int dir, int subsampling_x, int subsampling_y) {
2072     assert(is_motion_variation_allowed_bsize(bsize));
2073 
2074     const BlockSize bsize_plane = get_plane_block_size(bsize, subsampling_x, subsampling_y);
2075     switch (bsize_plane) {
2076 #if DISABLE_CHROMA_U8X8_OBMC
2077         case BLOCK_4X4:
2078     case BLOCK_8X4:
2079     case BLOCK_4X8: return 1; break;
2080 #else
2081         case BLOCK_4X4:
2082         case BLOCK_8X4:
2083         case BLOCK_4X8: return dir == 0; break;
2084 #endif
2085         default: return 0;
2086     }
2087 }
2088 
2089 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)
2090 
2091 /**
2092  * Computes SSE of a compound predictor constructed from 2 fundamental
2093  * predictors p0 and p1 using blending with mask.
2094  *
2095  * r1:  Residuals of p1.
2096  *      (source - p1)
2097  * d:   Difference of p1 and p0.
2098  *      (p1 - p0)
2099  * m:   The blending mask
2100  * N:   Number of pixels
2101  *
2102  * 'r1', 'd', and 'm' are contiguous.
2103  *
2104  * Computes:
2105  *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
2106  *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
 *    where r0 is (source - p0), and r1 is (source - p1), which in turn is
 *    equivalent to:
2109  *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
2110  *    which is the SSE of the residuals of the compound predictor scaled up by
2111  *    MAX_MASK_VALUE**2.
2112  *
2113  * Note that we clamp the partial term in the loop to 16 bits signed. This is
2114  * to facilitate equivalent SIMD implementation. It should have no effect if
2115  * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always
2116  * holds for 8 bit input, and on real input, it should hold practically always,
2117  * as residuals are expected to be small.
2118  */
svt_av1_wedge_sse_from_residuals_c(const int16_t * r1,const int16_t * d,const uint8_t * m,int N)2119 uint64_t svt_av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, const uint8_t *m,
2120                                             int N) {
2121     uint64_t csse = 0;
2122 
2123     for (int i = 0; i < N; i++) {
2124         int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
2125         t         = clamp(t, INT16_MIN, INT16_MAX);
2126         csse += t * t;
2127     }
2128     return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
2129 }
2130 
2131 
combine_interintra(InterIntraMode mode,int8_t use_wedge_interintra,int wedge_index,int wedge_sign,BlockSize bsize,BlockSize plane_bsize,uint8_t * comppred,int compstride,const uint8_t * interpred,int interstride,const uint8_t * intrapred,int intrastride)2132 void combine_interintra(InterIntraMode mode, int8_t use_wedge_interintra, int wedge_index,
2133                         int wedge_sign, BlockSize bsize, BlockSize plane_bsize, uint8_t *comppred,
2134                         int compstride, const uint8_t *interpred, int interstride,
2135                         const uint8_t *intrapred, int intrastride) {
2136     const int bw = block_size_wide[plane_bsize];
2137     const int bh = block_size_high[plane_bsize];
2138 
2139     if (use_wedge_interintra) {
2140         if (is_interintra_wedge_used(bsize)) {
2141             const uint8_t *mask = av1_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
2142             const int      subw = 2 * mi_size_wide[bsize] == bw;
2143             const int      subh = 2 * mi_size_high[bsize] == bh;
2144             svt_aom_blend_a64_mask(comppred,
2145                                    compstride,
2146                                    intrapred,
2147                                    intrastride,
2148                                    interpred,
2149                                    interstride,
2150                                    mask,
2151                                    block_size_wide[bsize],
2152                                    bw,
2153                                    bh,
2154                                    subw,
2155                                    subh);
2156         }
2157         return;
2158     } else {
2159         uint8_t mask[MAX_SB_SQUARE];
2160         build_smooth_interintra_mask(mask, bw, plane_bsize, mode);
2161         svt_aom_blend_a64_mask(comppred,
2162                                compstride,
2163                                intrapred,
2164                                intrastride,
2165                                interpred,
2166                                interstride,
2167                                mask,
2168                                bw,
2169                                bw,
2170                                bh,
2171                                0,
2172                                0);
2173     }
2174 }
2175 
svt_aom_highbd_blend_a64_hmask_16bit_c(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h,int bd)2176 void svt_aom_highbd_blend_a64_hmask_16bit_c(uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
2177                                             uint32_t src0_stride, const uint16_t *src1,
2178                                             uint32_t src1_stride, const uint8_t *mask, int w, int h,
2179                                             int bd) {
2180     (void)bd;
2181 
2182     assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
2183     assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
2184 
2185     assert(h >= 1);
2186     assert(w >= 1);
2187     assert(IS_POWER_OF_TWO(h));
2188     assert(IS_POWER_OF_TWO(w));
2189 
2190     assert(bd == 8 || bd == 10 || bd == 12);
2191 
2192     for (int i = 0; i < h; ++i) {
2193         for (int j = 0; j < w; ++j) {
2194             dst[i * dst_stride + j] =
2195                     AOM_BLEND_A64(mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
2196         }
2197     }
2198 }
2199 
svt_aom_sum_squares_i16_c(const int16_t * src,uint32_t n)2200 uint64_t svt_aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
2201     uint64_t ss = 0;
2202     do {
2203         const int16_t v = *src++;
2204         ss += v * v;
2205     } while (--n);
2206 
2207     return ss;
2208 }
2209 
// obmc_mask_N[overlap_position]
// One table per supported overlap length N (1, 2, 4, 8, 16, 32, 64); the
// table for a given length is handed out by svt_av1_get_obmc_mask(). Values
// are non-decreasing across the overlap and end at 64.
// NOTE(review): 64 is presumably the full-weight value of the A64 blend
// (weights out of 64) - confirm against the blend routine.
static const uint8_t obmc_mask_1[1]                      = {64};
DECLARE_ALIGNED(2, static const uint8_t, obmc_mask_2[2]) = {45, 64};

DECLARE_ALIGNED(4, static const uint8_t, obmc_mask_4[4]) = {39, 50, 59, 64};

static const uint8_t obmc_mask_8[8] = {36, 42, 48, 53, 57, 61, 64, 64};

static const uint8_t obmc_mask_16[16] = {
        34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 64, 64, 64, 64};

static const uint8_t obmc_mask_32[32] = {33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48,
                                         50, 51, 52, 53, 55, 56, 57, 58, 59, 60, 60,
                                         61, 62, 64, 64, 64, 64, 64, 64, 64, 64};

static const uint8_t obmc_mask_64[64] = {
        33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44, 45, 46, 47, 47, 48, 49,
        50, 51, 51, 51, 52, 52, 53, 54, 55, 56, 56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60,
        61, 62, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
2230 
svt_av1_get_obmc_mask(int length)2231 const uint8_t *svt_av1_get_obmc_mask(int length) {
2232     switch (length) {
2233         case 1: return obmc_mask_1;
2234         case 2: return obmc_mask_2;
2235         case 4: return obmc_mask_4;
2236         case 8: return obmc_mask_8;
2237         case 16: return obmc_mask_16;
2238         case 32: return obmc_mask_32;
2239         case 64: return obmc_mask_64;
2240         default: assert(0); return NULL;
2241     }
2242 }
2243